diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Kconfig | 12 | ||||
-rw-r--r-- | arch/x86/kvm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/kvm/assigned-dev.c | 1058 | ||||
-rw-r--r-- | arch/x86/kvm/assigned-dev.h | 32 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 23 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 19 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 110 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 75 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 31 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.h | 16 | ||||
-rw-r--r-- | arch/x86/kvm/iommu.c | 356 | ||||
-rw-r--r-- | arch/x86/kvm/irq.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 32 | ||||
-rw-r--r-- | arch/x86/kvm/irq_comm.c | 50 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 147 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 181 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 7 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/page_track.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 87 | ||||
-rw-r--r-- | arch/x86/kvm/pmu_intel.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 147 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 734 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 392 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 36 |
26 files changed, 1195 insertions, 2376 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ab8e32f7b9a8..760433b2574a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,18 +86,6 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. -config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support (DEPRECATED)" - depends on KVM && PCI && IOMMU_API - default n - ---help--- - Provide support for legacy PCI device assignment through KVM. The - kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes this support and provides - better security. - - If unsure, say N. - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 3bff20710471..09d4b17be022 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,8 +15,6 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o - kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c deleted file mode 100644 index 308b8597c691..000000000000 --- a/arch/x86/kvm/assigned-dev.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * Kernel-based Virtual Machine - device assignment support - * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> -#include <linux/errno.h> -#include <linux/spinlock.h> -#include <linux/pci.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/namei.h> -#include <linux/fs.h> -#include "irq.h" -#include "assigned-dev.h" -#include "trace/events/kvm.h" - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct list_head list; - int assigned_dev_id; - int host_segnr; - int host_busnr; - int host_devfn; - unsigned int entries_nr; - int host_irq; - bool host_irq_disabled; - bool pci_2_3; - struct msix_entry *host_msix_entries; - int guest_irq; - struct msix_entry *guest_msix_entries; - unsigned long irq_requested_type; - int irq_source_id; - int flags; - struct pci_dev *dev; - struct kvm *kvm; - spinlock_t intx_lock; - spinlock_t intx_mask_lock; - char irq_name[32]; - struct pci_saved_state *pci_saved_state; -}; - -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct kvm_assigned_dev_kernel *match; - - list_for_each_entry(match, head, list) { - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - - return index; -} - -static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret; - - spin_lock(&assigned_dev->intx_lock); - if (pci_check_and_mask_intx(assigned_dev->dev)) { - assigned_dev->host_irq_disabled = true; - ret = IRQ_WAKE_THREAD; - } else - ret = IRQ_NONE; - spin_unlock(&assigned_dev->intx_lock); - - return ret; -} - -static void -kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, - int vector) -{ - if (unlikely(assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_INTX)) { - spin_lock(&assigned_dev->intx_mask_lock); - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1, - false); - spin_unlock(&assigned_dev->intx_mask_lock); - } else - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1, false); -} - -static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock_irq(&assigned_dev->intx_lock); - } - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, - int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - - -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - int ret = 0; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - vector, 1); - } - - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); - } - - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev = - container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); - - spin_lock(&dev->intx_mask_lock); - - if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { - bool reassert = false; - - spin_lock_irq(&dev->intx_lock); - /* - * The guest IRQ may be shared so this ack can come from an - * IRQ for another guest device. - */ - if (dev->host_irq_disabled) { - if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) - enable_irq(dev->host_irq); - else if (!pci_check_and_unmask_intx(dev->dev)) - reassert = true; - dev->host_irq_disabled = reassert; - } - spin_unlock_irq(&dev->intx_lock); - - if (reassert) - kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1, false); - } - - spin_unlock(&dev->intx_mask_lock); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - if (assigned_dev->ack_notifier.gsi != -1) - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev->ack_notifier); - - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0, false); - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * We disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. - * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. - */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - if ((assigned_dev->irq_requested_type & - KVM_DEV_IRQ_HOST_INTX) && - (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - pci_intx(assigned_dev->dev, false); - spin_unlock_irq(&assigned_dev->intx_lock); - synchronize_irq(assigned_dev->host_irq); - } else - disable_irq(assigned_dev->host_irq); - - free_irq(assigned_dev->host_irq, assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if (host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - if (pci_load_and_free_saved_state(assigned_dev->dev, - &assigned_dev->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&assigned_dev->dev->dev)); - else - pci_restore_state(assigned_dev->dev); - - pci_clear_dev_assigned(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *assigned_dev, *tmp; - - list_for_each_entry_safe(assigned_dev, tmp, - &kvm->arch.assigned_dev_head, list) { - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - irq_handler_t irq_handler; - unsigned long flags; - - dev->host_irq = dev->dev->irq; - - /* - * We can only share the IRQ line with other host devices if we are - * able to disable the IRQ source at device-level - independently of - * the guest driver. Otherwise host devices may suffer from unbounded - * IRQ latencies when the guest keeps the line asserted. - */ - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - irq_handler = kvm_assigned_dev_intx; - flags = IRQF_SHARED; - } else { - irq_handler = NULL; - flags = IRQF_ONESHOT; - } - if (request_threaded_irq(dev->host_irq, irq_handler, - kvm_assigned_dev_thread_intx, flags, - dev->irq_name, dev)) - return -EIO; - - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - spin_lock_irq(&dev->intx_lock); - pci_intx(dev->dev, true); - spin_unlock_irq(&dev->intx_lock); - } - return 0; -} - -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, - kvm_assigned_dev_thread_msi, 0, - dev->irq_name, dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} - -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix_exact(dev->dev, - dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_threaded_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_msix, - kvm_assigned_dev_thread_msix, - 0, dev->irq_name, dev); - if (r) - goto err; - } - - return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, dev); - pci_disable_msix(dev->dev); - return r; -} - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - __u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; - default: - r = -EINVAL; - } - dev->host_irq_disabled = false; - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - if (dev->ack_notifier.gsi != -1) - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else { - kvm_free_irq_source_id(kvm, dev->irq_source_id); - dev->irq_source_id = -1; - } - - return r; -} - -/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - unsigned long irq_type; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | - KVM_DEV_IRQ_GUEST_MASK); - r = kvm_deassign_irq(kvm, match, irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. - */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = d_backing_inode(path.dentry); - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0, idx; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; - - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, - assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - - /* Don't allow bridges to be assigned */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - pci_save_state(dev); - match->pci_saved_state = pci_store_saved_state(dev); - if (!match->pci_saved_state) - printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", - __func__, dev_name(&dev->dev)); - - if (!pci_intx_mask_supported(dev)) - assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_segnr = assigned_dev->segnr; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->intx_lock); - spin_lock_init(&match->intx_mask_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match->dev); - if (r) - goto out_list_del; - -out: - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -out_list_del: - if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&dev->dev)); - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - kvm_deassign_device(kvm, match->dev); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} - - -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} - -static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - r = -ENODEV; - goto out; - } - - spin_lock(&match->intx_mask_lock); - - match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; - match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; - - if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { - kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0, false); - /* - * Masking at hardware-level is performed on demand, - * i.e. when an IRQ actually arrives at the host. - */ - } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - /* - * Unmask the IRQ line if required. Unmasking at - * device level will be performed by user space. - */ - spin_lock_irq(&match->intx_lock); - if (match->host_irq_disabled) { - enable_irq(match->host_irq); - match->host_irq_disabled = false; - } - spin_unlock_irq(&match->intx_lock); - } - } - - spin_unlock(&match->intx_mask_lock); - -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) - goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_INTX_MASK: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); - break; - } - default: - r = -ENOTTY; - break; - } -out: - return r; -} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h deleted file mode 100644 index a428c1a211b2..000000000000 --- a/arch/x86/kvm/assigned-dev.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H -#define ARCH_X86_KVM_ASSIGNED_DEV_H - -#include <linux/kvm_host.h> - -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); - -int kvm_iommu_map_guest(struct kvm *kvm); -int kvm_iommu_unmap_guest(struct kvm *kvm); - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg); - -void kvm_free_all_assigned_devices(struct kvm *kvm); -#else -static inline int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - return 0; -} - -static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} -#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ - -#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index efde6cc50875..59ca2eea522c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -780,18 +780,20 @@ out: static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; + struct kvm_cpuid_entry2 *ej; + int j = i; + int nent = vcpu->arch.cpuid_nent; e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ + do { + j = (j + 1) % nent; + ej = &vcpu->arch.cpuid_entries[j]; + } while (ej->function != e->function); + + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + + return j; } /* find an entry with matching function, matching index (if needed), and that @@ -876,6 +878,9 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 eax, ebx, ecx, edx; + if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) + return 1; + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 35058c2c0eea..da6728383052 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} + static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -205,4 +213,15 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } +static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; +} + +static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_misc_features_enables & + MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 45c7306c8780..fb0055953fbc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += sizeof(_type); \ - _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \ ctxt->fetch.ptr += sizeof(_type); \ _x; \ }) @@ -2547,7 +2547,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) u64 smbase; int ret; - if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); /* @@ -2596,11 +2596,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; } - if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) ctxt->ops->set_nmi_mask(ctxt, false); - ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; - ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); return X86EMUL_CONTINUE; } @@ -2742,6 +2742,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); } + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; return X86EMUL_CONTINUE; } @@ -3854,6 +3855,13 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; + u64 msr = 0; + + ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr); + if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + ctxt->ops->cpl(ctxt)) { + return emulate_gp(ctxt, 0); + } eax = reg_read(ctxt, VCPU_REGS_RAX); ecx = reg_read(ctxt, VCPU_REGS_RCX); @@ -3934,6 +3942,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) } /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save + * and restore MXCSR. + */ +static size_t __fxstate_size(int nregs) +{ + return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16; +} + +static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt) +{ + bool cr4_osfxsr; + if (ctxt->mode == X86EMUL_MODE_PROT64) + return __fxstate_size(16); + + cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR; + return __fxstate_size(cr4_osfxsr ? 8 : 0); +} + +/* * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, * 1) 16 bit mode * 2) 32 bit mode @@ -3954,7 +3981,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) static int em_fxsave(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; - size_t size; int rc; rc = check_fxsr(ctxt); @@ -3970,68 +3996,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) - size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); - else - size = offsetof(struct fxregs_state, xmm_space[0]); - - return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); -} - -static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, - struct fxregs_state *new) -{ - int rc = X86EMUL_CONTINUE; - struct fxregs_state old; - - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); - if (rc != X86EMUL_CONTINUE) - return rc; - - /* - * 64 bit host will restore XMM 8-15, which is not correct on non-64 - * bit guests. Load the current values in order to preserve 64 bit - * XMMs after fxrstor. - */ -#ifdef CONFIG_X86_64 - /* XXX: accessing XMM 8-15 very awkwardly */ - memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); -#endif - - /* - * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but - * does save and restore MXCSR. - */ - if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) - memcpy(new->xmm_space, old.xmm_space, 8 * 16); - - return rc; + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, + fxstate_size(ctxt)); } static int em_fxrstor(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; int rc; + size_t size; rc = check_fxsr(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); - if (rc != X86EMUL_CONTINUE) - return rc; + ctxt->ops->get_fpu(ctxt); - if (fx_state.mxcsr >> 16) - return emulate_gp(ctxt, 0); + size = fxstate_size(ctxt); + if (size < __fxstate_size(16)) { + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + if (rc != X86EMUL_CONTINUE) + goto out; + } - ctxt->ops->get_fpu(ctxt); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + if (rc != X86EMUL_CONTINUE) + goto out; - if (ctxt->mode < X86EMUL_MODE_PROT64) - rc = fxrstor_fixup(ctxt, &fx_state); + if (fx_state.mxcsr >> 16) { + rc = emulate_gp(ctxt, 0); + goto out; + } if (rc == X86EMUL_CONTINUE) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); +out: ctxt->ops->put_fpu(ctxt); return rc; @@ -4166,7 +4166,7 @@ static int check_dr_write(struct x86_emulate_ctxt *ctxt) static int check_svme(struct x86_emulate_ctxt *ctxt) { - u64 efer; + u64 efer = 0; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); @@ -5316,6 +5316,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; + unsigned emul_flags; ctxt->mem_read.pos = 0; @@ -5330,6 +5331,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5363,7 +5365,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5392,7 +5394,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5446,7 +5448,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 73ea24d4f119..bdcd4139eca9 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -49,7 +49,7 @@ static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu, *found = NULL; + struct kvm_vcpu *vcpu; int i; s->wakeup_needed = false; @@ -59,16 +59,11 @@ static void pic_unlock(struct kvm_pic *s) if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { if (kvm_apic_accept_pic_intr(vcpu)) { - found = vcpu; - break; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + return; } } - - if (!found) - return; - - kvm_make_request(KVM_REQ_EVENT, found); - kvm_vcpu_kick(found); } } @@ -239,7 +234,7 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; s->output = 0; @@ -273,7 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm) return intno; } -void kvm_pic_reset(struct kvm_kpic_state *s) +static void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, i; struct kvm_vcpu *vcpu; @@ -422,19 +417,16 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) return ret; } -static u32 pic_ioport_read(void *opaque, u32 addr1) +static u32 pic_ioport_read(void *opaque, u32 addr) { struct kvm_kpic_state *s = opaque; - unsigned int addr; int ret; - addr = addr1; - addr &= 1; if (s->poll) { - ret = pic_poll_read(s, addr1); + ret = pic_poll_read(s, addr); s->poll = 0; } else - if (addr == 0) + if ((addr & 1) == 0) if (s->read_reg_select) ret = s->isr; else @@ -456,76 +448,64 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(gpa_t addr) -{ - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - case 0x4d0: - case 0x4d1: - return 1; - default: - return 0; - } -} - static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { unsigned char data = *(unsigned char *)val; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; if (len != 1) { pr_pic_unimpl("non byte write\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: + pic_lock(s); pic_ioport_write(&s->pics[addr >> 7], addr, data); + pic_unlock(s); break; case 0x4d0: case 0x4d1: + pic_lock(s); elcr_ioport_write(&s->pics[addr & 1], addr, data); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - pic_unlock(s); return 0; } static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { - unsigned char data = 0; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; + unsigned char *data = (unsigned char *)val; if (len != 1) { memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: - data = pic_ioport_read(&s->pics[addr >> 7], addr); + pic_lock(s); + *data = pic_ioport_read(&s->pics[addr >> 7], addr); + pic_unlock(s); break; case 0x4d0: case 0x4d1: - data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_lock(s); + *data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - *(unsigned char *)val = data; - pic_unlock(s); return 0; } @@ -576,7 +556,7 @@ static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, */ static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; if (!s->output) s->wakeup_needed = true; @@ -657,9 +637,14 @@ void kvm_pic_destroy(struct kvm *kvm) { struct kvm_pic *vpic = kvm->arch.vpic; + if (!vpic) + return; + + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = NULL; kfree(vpic); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 6e219e5c07d2..bdff437acbcb 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -266,11 +266,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) spin_unlock(&ioapic->lock); } -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) + if (!ioapic_in_kernel(kvm)) return; kvm_make_scan_ioapic_request(kvm); } @@ -315,7 +313,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); - kvm_vcpu_request_scan_ioapic(ioapic->kvm); + kvm_make_scan_ioapic_request(ioapic->kvm); break; } } @@ -624,10 +622,8 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); - return ret; } - kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -635,37 +631,36 @@ void kvm_ioapic_destroy(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + if (!ioapic) + return; + cancel_delayed_work_sync(&ioapic->eoi_inject); + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); kvm->arch.vioapic = NULL; kfree(ioapic); } -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - if (!ioapic) - return -EINVAL; + struct kvm_ioapic *ioapic = kvm->arch.vioapic; spin_lock(&ioapic->lock); memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); state->irr &= ~ioapic->irr_delivered; spin_unlock(&ioapic->lock); - return 0; } -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - if (!ioapic) - return -EINVAL; + struct kvm_ioapic *ioapic = kvm->arch.vioapic; spin_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); - return 0; } diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 1cc6e54436db..29ce19732ccf 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -105,17 +105,13 @@ do { \ #define ASSERT(x) do { } while (0) #endif -static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vioapic; -} - static inline int ioapic_in_kernel(struct kvm *kvm) { - int ret; + int mode = kvm->arch.irqchip_mode; - ret = (ioapic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); @@ -132,8 +128,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map); -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c deleted file mode 100644 index b181426f67b4..000000000000 --- a/arch/x86/kvm/iommu.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: Allen M. Kay <allen.m.kay@intel.com> - * Author: Weidong Han <weidong.han@intel.com> - * Author: Ben-Ami Yassour <benami@il.ibm.com> - */ - -#include <linux/list.h> -#include <linux/kvm_host.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/stat.h> -#include <linux/iommu.h> -#include "assigned-dev.h" - -static bool allow_unsafe_assigned_interrupts; -module_param_named(allow_unsafe_assigned_interrupts, - allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, - "Enable device assignment on platforms without interrupt remapping support."); - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long npages) -{ - gfn_t end_gfn; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + npages; - gfn += 1; - - if (is_error_noslot_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(slot, gfn++); - - return pfn; -} - -static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - gfn_t gfn, end_gfn; - kvm_pfn_t pfn; - int r = 0; - struct iommu_domain *domain = kvm->arch.iommu_domain; - int flags; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - - flags = IOMMU_READ; - if (!(slot->flags & KVM_MEM_READONLY)) - flags |= IOMMU_WRITE; - if (!kvm->arch.iommu_noncoherent) - flags |= IOMMU_CACHE; - - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* Make sure hva is aligned to the page size we want to map */ - while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. - */ - pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); - if (is_error_noslot_pfn(pfn)) { - gfn += 1; - continue; - } - - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - page_size, flags); - if (r) { - printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); - goto unmap_pages; - } - - gfn += page_size >> PAGE_SHIFT; - - cond_resched(); - } - - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int idx, r = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - if (kvm->arch.iommu_noncoherent) - kvm_arch_register_noncoherent_dma(kvm); - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - r = kvm_iommu_map_pages(kvm, memslot); - if (r) - break; - } - srcu_read_unlock(&kvm->srcu, idx); - - return r; -} - -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; - bool noncoherent; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - r = iommu_attach_device(domain, &pdev->dev); - if (r) { - dev_err(&pdev->dev, "kvm assign device failed ret %d", r); - return r; - } - - noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); - - /* Check if need to update IOMMU page table for guest memory */ - if (noncoherent != kvm->arch.iommu_noncoherent) { - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_noncoherent = noncoherent; - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - } - - kvm_arch_start_assignment(kvm); - pci_set_dev_assigned(pdev); - - dev_info(&pdev->dev, "kvm assign device\n"); - - return 0; -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - iommu_detach_device(domain, &pdev->dev); - - pci_clear_dev_assigned(pdev); - kvm_arch_end_assignment(kvm); - - dev_info(&pdev->dev, "kvm deassign device\n"); - - return 0; -} - -int kvm_iommu_map_guest(struct kvm *kvm) -{ - int r; - - if (!iommu_present(&pci_bus_type)) { - printk(KERN_ERR "%s: iommu not found\n", __func__); - return -ENODEV; - } - - mutex_lock(&kvm->slots_lock); - - kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); - if (!kvm->arch.iommu_domain) { - r = -ENOMEM; - goto out_unlock; - } - - if (!allow_unsafe_assigned_interrupts && - !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { - printk(KERN_WARNING "%s: No interrupt remapping support," - " disallowing device assignment." - " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" - " module option.\n", __func__); - iommu_domain_free(kvm->arch.iommu_domain); - kvm->arch.iommu_domain = NULL; - r = -EPERM; - goto out_unlock; - } - - r = kvm_iommu_map_memslots(kvm); - if (r) - kvm_iommu_unmap_memslots(kvm); - -out_unlock: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - struct iommu_domain *domain; - gfn_t end_gfn, gfn; - kvm_pfn_t pfn; - u64 phys; - - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - - /* check if iommu exists and in use */ - if (!domain) - return; - - while (gfn < end_gfn) { - unsigned long unmap_pages; - size_t size; - - /* Get physical address */ - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - - if (!phys) { - gfn++; - continue; - } - - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); - unmap_pages = 1ULL << get_order(size); - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; - - cond_resched(); - } -} - -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int idx; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - kvm_iommu_unmap_pages(kvm, memslot); - - srcu_read_unlock(&kvm->srcu, idx); - - if (kvm->arch.iommu_noncoherent) - kvm_arch_unregister_noncoherent_dma(kvm); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - mutex_lock(&kvm->slots_lock); - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_domain = NULL; - kvm->arch.iommu_noncoherent = false; - mutex_unlock(&kvm->slots_lock); - - iommu_domain_free(domain); - return 0; -} diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 60d91c9d160c..5c24811e8b0b 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -60,7 +60,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) if (irqchip_split(v->kvm)) return pending_userspace_extint(v); else - return pic_irqchip(v->kvm)->output; + return v->kvm->arch.vpic->output; } else return 0; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 40d5b2cf6061..d5005cc26521 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -78,40 +78,42 @@ void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - static inline int pic_in_kernel(struct kvm *kvm) { - int ret; + int mode = kvm->arch.irqchip_mode; - ret = (pic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_SPLIT; } static inline int irqchip_kernel(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_in_kernel(struct kvm *kvm) { - bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; + int mode = kvm->arch.irqchip_mode; - /* Matches with wmb after initializing kvm->irq_routing. */ + /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); - return ret; + return mode != KVM_IRQCHIP_NONE; } -void kvm_pic_reset(struct kvm_kpic_state *s); - void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6825cd36d13b..3cc3b2d130a0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -42,7 +42,7 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { - struct kvm_pic *pic = pic_irqchip(kvm); + struct kvm_pic *pic = kvm->arch.vpic; return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -232,11 +232,11 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!ioapic_in_kernel(kvm)) + if (!irqchip_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); + kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); unlock: mutex_unlock(&kvm->irq_lock); } @@ -274,42 +274,42 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int r = -EINVAL; - int delta; - unsigned max_pin; - + /* We can't check irqchip_in_kernel() here as some callers are + * currently inititalizing the irqchip. Other callers should therefore + * check kvm_arch_can_set_irq_routing() before calling this function. + */ switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: - delta = 0; + if (irqchip_split(kvm)) + return -EINVAL; + e->irqchip.pin = ue->u.irqchip.pin; switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_SLAVE: - delta = 8; + e->irqchip.pin += PIC_NUM_PINS / 2; /* fall through */ case KVM_IRQCHIP_PIC_MASTER: - if (!pic_in_kernel(kvm)) - goto out; - + if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) + return -EINVAL; e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; break; case KVM_IRQCHIP_IOAPIC: - if (!ioapic_in_kernel(kvm)) - goto out; - - max_pin = KVM_IOAPIC_NUM_PINS; + if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) + return -EINVAL; e->set = kvm_set_ioapic_irq; break; default: - goto out; + return -EINVAL; } e->irqchip.irqchip = ue->u.irqchip.irqchip; - e->irqchip.pin = ue->u.irqchip.pin + delta; - if (e->irqchip.pin >= max_pin) - goto out; break; case KVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; @@ -318,7 +318,7 @@ int kvm_set_routing_entry(struct kvm *kvm, e->msi.data = ue->u.msi.data; if (kvm_msi_route_invalid(kvm, e)) - goto out; + return -EINVAL; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -326,12 +326,10 @@ int kvm_set_routing_entry(struct kvm *kvm, e->hv_sint.sint = ue->u.hv_sint.sint; break; default: - goto out; + return -EINVAL; } - r = 0; -out: - return r; + return 0; } bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bad6a25067bc..2819d4c123eb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -177,8 +177,8 @@ static void recalculate_apic_map(struct kvm *kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); - new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + new = kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); if (!new) goto out; @@ -529,14 +529,16 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -1493,31 +1495,65 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + preempt_disable(); kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; + preempt_enable(); } static bool start_hv_timer(struct kvm_lapic *apic) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + int r; - if ((atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) || - kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - } else { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); + if (!kvm_x86_ops->set_hv_timer) + return false; - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) - cancel_hv_timer(apic); + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return false; + + r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); + if (r < 0) + return false; + + ktimer->hv_timer_in_use = true; + hrtimer_cancel(&ktimer->timer); + + /* + * Also recheck ktimer->pending, in case the sw timer triggered in + * the window. For periodic timer, leave the hv timer running for + * simplicity, and the deadline will be recomputed on the next vmexit. + */ + if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) { + if (r) + apic_timer_expired(apic); + return false; } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - return apic->lapic_timer.hv_timer_in_use; + + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true); + return true; +} + +static void start_sw_timer(struct kvm_lapic *apic) +{ + struct kvm_timer *ktimer = &apic->lapic_timer; + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false); +} + +static void restart_apic_timer(struct kvm_lapic *apic) +{ + if (!start_hv_timer(apic)) + start_sw_timer(apic); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) @@ -1531,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); - if (!start_hv_timer(apic)) - start_sw_period(apic); + restart_apic_timer(apic); } } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(apic->lapic_timer.hv_timer_in_use); - - start_hv_timer(apic); + restart_apic_timer(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1552,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; /* Possibly the TSC deadline timer is not enabled yet */ - if (!apic->lapic_timer.hv_timer_in_use) - return; - - cancel_hv_timer(apic); + if (apic->lapic_timer.hv_timer_in_use) + start_sw_timer(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); - if (atomic_read(&apic->lapic_timer.pending)) - return; +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - start_sw_period(apic); - else if (apic_lvtt_tscdeadline(apic)) - start_sw_tscdeadline(apic); + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + restart_apic_timer(apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { atomic_set(&apic->lapic_timer.pending, 0); - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - if (set_target_expiration(apic) && - !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_period(apic); - } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_tscdeadline(apic); - } + if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + && !set_target_expiration(apic)) + return; + + restart_apic_timer(apic); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -1809,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu)) - return 0; - - return apic->lapic_timer.tscdeadline; -} - u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1932,7 +1948,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) for (i = 0; i < KVM_APIC_LVT_NUM; i++) kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) + if (kvm_vcpu_is_reset_bsp(vcpu) && + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) kvm_lapic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); @@ -2285,8 +2302,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2338,14 +2355,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2439,7 +2456,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index bcbe811f3b97..29caa2c3dff9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); @@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ac7810513d0e..aafd399cf8c6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -183,13 +183,13 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_present_mask; /* - * The mask/value to distinguish a PTE that has been marked not-present for - * access tracking purposes. - * The mask would be either 0 if access tracking is disabled, or - * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. + * Non-present SPTEs with shadow_acc_track_value set are in place for access + * tracking. */ static u64 __read_mostly shadow_acc_track_mask; static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; @@ -207,16 +207,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { + BUG_ON((mmio_mask & mmio_value) != mmio_value); + shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return !(spte & shadow_acc_track_value); +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; +} + static inline bool is_access_track_spte(u64 spte) { - /* Always false if shadow_acc_track_mask is zero. */ - return (spte & shadow_acc_track_mask) == shadow_acc_track_value; + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } /* @@ -270,7 +294,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -278,7 +302,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, static bool is_mmio_spte(u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_mask; + return (spte & shadow_mmio_mask) == shadow_mmio_value; } static gfn_t get_mmio_spte_gfn(u64 spte) @@ -315,12 +339,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +/* + * Sets the shadow PTE masks used by the MMU. + * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, u64 acc_track_mask) { - if (acc_track_mask != 0) - acc_track_mask |= SPTE_SPECIAL_MASK; + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & shadow_acc_track_value); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -329,7 +361,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_x_mask = x_mask; shadow_present_mask = p_mask; shadow_acc_track_mask = acc_track_mask; - WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -549,7 +580,7 @@ static bool spte_has_volatile_bits(u64 spte) is_access_track_spte(spte)) return true; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { if ((spte & shadow_accessed_mask) == 0 || (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) return true; @@ -560,14 +591,17 @@ static bool spte_has_volatile_bits(u64 spte) static bool is_accessed_spte(u64 spte) { - return shadow_accessed_mask ? spte & shadow_accessed_mask - : !is_access_track_spte(spte); + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); } static bool is_dirty_spte(u64 spte) { - return shadow_dirty_mask ? spte & shadow_dirty_mask - : spte & PT_WRITABLE_MASK; + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -707,10 +741,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static u64 mark_spte_for_access_track(u64 spte) { - if (shadow_accessed_mask != 0) + if (spte_ad_enabled(spte)) return spte & ~shadow_accessed_mask; - if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + if (is_access_track_spte(spte)) return spte; /* @@ -729,7 +763,6 @@ static u64 mark_spte_for_access_track(u64 spte) spte |= (spte & shadow_acc_track_saved_bits_mask) << shadow_acc_track_saved_bits_shift; spte &= ~shadow_acc_track_mask; - spte |= shadow_acc_track_value; return spte; } @@ -741,6 +774,7 @@ static u64 restore_acc_track_spte(u64 spte) u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) & shadow_acc_track_saved_bits_mask; + WARN_ON_ONCE(spte_ad_enabled(spte)); WARN_ON_ONCE(!is_access_track_spte(spte)); new_spte &= ~shadow_acc_track_mask; @@ -759,7 +793,7 @@ static bool mmu_spte_age(u64 *sptep) if (!is_accessed_spte(spte)) return false; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { @@ -1390,6 +1424,22 @@ static bool spte_clear_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } +static bool wrprot_ad_disabled_spte(u64 *sptep) +{ + bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, + (unsigned long *)sptep); + if (was_writable) + kvm_set_pfn_dirty(spte_to_pfn(*sptep)); + + return was_writable; +} + +/* + * Gets the GFN ready for another round of dirty logging by clearing the + * - D bit on ad-enabled SPTEs, and + * - W bit on ad-disabled SPTEs. + * Returns true iff any D or W bits were cleared. + */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; @@ -1397,7 +1447,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_clear_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_clear_dirty(sptep); + else + flush |= wrprot_ad_disabled_spte(sptep); return flush; } @@ -1420,7 +1473,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_set_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_set_dirty(sptep); return flush; } @@ -1452,7 +1506,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } /** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write + * protect the page if the D-bit isn't supported. * @kvm: kvm instance * @slot: slot to clear D-bit * @gfn_offset: start of the BITS_PER_LONG pages we care about @@ -1498,6 +1553,21 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } +/** + * kvm_arch_write_log_dirty - emulate dirty page logging + * @vcpu: Guest mode vcpu + * + * Emulate arch specific page modification logging for the + * nested hypervisor + */ +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->write_log_dirty) + return kvm_x86_ops->write_log_dirty(vcpu); + + return 0; +} + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { @@ -1751,18 +1821,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep; struct rmap_iterator iter; - /* - * If there's no access bit in the secondary pte set by the hardware and - * fast access tracking is also not enabled, it's up to gup-fast/gup to - * set the access bit in the primary pte or in the page structure. - */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) if (is_accessed_spte(*sptep)) return 1; -out: return 0; } @@ -1783,18 +1844,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - /* - * In case of absence of EPT Access and Dirty Bits supports, - * emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. - */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - return kvm_handle_hva_range(kvm, start, end, 0, - kvm_unmap_rmapp); - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -2383,7 +2432,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask; + + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + else + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2651,10 +2705,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte = 0; int ret = 0; + struct kvm_mmu_page *sp; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; + sp = page_header(__pa(sptep)); + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + /* * For the EPT case, shadow_present_mask is 0 if hardware * supports exec-only page table entries. In that case, @@ -2663,7 +2722,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, */ spte |= shadow_present_mask; if (!speculative) - spte |= shadow_accessed_mask; + spte |= spte_shadow_accessed_mask(spte); if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; @@ -2720,7 +2779,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_WRITE_MASK) { kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= shadow_dirty_mask; + spte |= spte_shadow_dirty_mask(spte); } if (speculative) @@ -2862,16 +2921,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; + sp = page_header(__pa(sptep)); + /* - * Since it's no accessed bit on EPT, it's no way to - * distinguish between actually accessed translations - * and prefetched, so disable pte prefetch if EPT is - * enabled. + * Without accessed bits, there's no way to distinguish between + * actually accessed translations and prefetched, so disable pte + * prefetch if accessed bits aren't available. */ - if (!shadow_accessed_mask) + if (sp_ad_disabled(sp)) return; - sp = page_header(__pa(sptep)); if (sp->role.level > PT_PAGE_TABLE_LEVEL) return; @@ -3683,12 +3742,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool can_do_async_pf(struct kvm_vcpu *vcpu) +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; + if (is_guest_mode(vcpu)) + return false; + return kvm_x86_ops->interrupt_allowed(vcpu); } @@ -3704,7 +3766,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!async) return false; /* *pfn has correct page already */ - if (!prefault && can_do_async_pf(vcpu)) { + if (!prefault && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(gva, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(gva, gfn); @@ -4272,6 +4334,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->base_role.word = 0; context->base_role.smm = is_smm(vcpu); + context->base_role.ad_disabled = (shadow_accessed_mask == 0); context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -4340,7 +4403,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty) { struct kvm_mmu *context = &vcpu->arch.mmu; @@ -4349,6 +4413,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->nx = true; + context->ept_ad = accessed_dirty; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; @@ -4357,6 +4422,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->root_level = context->shadow_root_level; context->root_hpa = INVALID_PAGE; context->direct_map = false; + context->base_role.ad_disabled = !accessed_dirty; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); @@ -4616,6 +4682,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; mask.smm = 1; + mask.ad_disabled = 1; /* * If we don't have indirect shadow pages, it means no page is diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ddc56e91f2e4..a276834950c1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); @@ -74,7 +74,9 @@ enum { int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty); +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -201,4 +203,5 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 5a24b846a1cb..8b97a6cba8d1 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -30,8 +30,9 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", __entry->mmu_valid_gen, \ + trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ + " %snxe %sad root %u %s%c", \ + __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? " pae" : "", \ role.quadrant, \ @@ -39,6 +40,7 @@ access_str[role.access], \ role.invalid ? " invalid" : "", \ role.nxe ? "" : "!", \ + role.ad_disabled ? "!" : "", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ saved_ptr; \ diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 37942e419c32..ea67dc876316 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -40,8 +40,8 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - slot->arch.gfn_track[i] = kvm_kvzalloc(npages * - sizeof(*slot->arch.gfn_track[i])); + slot->arch.gfn_track[i] = kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL); if (!slot->arch.gfn_track[i]) goto track_free; } @@ -160,6 +160,14 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } +void kvm_page_track_cleanup(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + cleanup_srcu_struct(&head->track_srcu); +} + void kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a01105485315..b0454c7e4cff 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,13 +23,6 @@ * so the code in this file is compiled twice, once per pte size. */ -/* - * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro - * uses for EPT without A/D paging type. - */ -extern u64 __pure __using_nonexistent_pte_bit(void) - __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); - #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -39,10 +32,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -60,10 +52,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #define CMPXCHG cmpxchg #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 @@ -74,16 +65,18 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK 0 - #define PT_GUEST_DIRTY_MASK 0 - #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() - #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_DIRTY_SHIFT 9 + #define PT_GUEST_ACCESSED_SHIFT 8 + #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif +#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) +#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) + #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) @@ -111,12 +104,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access, + unsigned gpte) { unsigned mask; /* dirty bit is not supported, so no need to track it */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return; BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); @@ -171,7 +165,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, goto no_present; /* if accessed bit is not supported prefetch non accessed gpte */ - if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; return false; @@ -217,7 +211,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, int ret; /* dirty/accessed bits are not supported, so no need to update them */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return 0; for (level = walker->max_level; level >= walker->level; --level) { @@ -232,6 +226,10 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (level == walker->level && write_fault && !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); +#if PTTYPE == PTTYPE_EPT + if (kvm_arch_write_log_dirty(vcpu)) + return -EINVAL; +#endif pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) @@ -285,9 +283,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; + u64 pt_access, pte_access; + unsigned index, accessed_dirty, pte_pkey; + unsigned nested_access; gpa_t pte_gpa; + bool have_ad; int offset; + u64 walk_nx_mask = 0; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; @@ -299,8 +301,10 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, retry_walk: walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); + have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 + walk_nx_mask = 1ULL << PT64_NX_SHIFT; if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); @@ -312,15 +316,21 @@ retry_walk: walker->max_level = walker->level; ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); - accessed_dirty = PT_GUEST_ACCESSED_MASK; - pt_access = pte_access = ACC_ALL; + /* + * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging + * by the MOV to CR instruction are treated as reads and do not cause the + * processor to set the dirty flag in any EPT paging-structure entry. + */ + nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; + + pte_access = ~0; ++walker->level; do { gfn_t real_gfn; unsigned long host_addr; - pt_access &= pte_access; + pt_access = pte_access; --walker->level; index = PT_INDEX(addr, walker->level); @@ -332,7 +342,7 @@ retry_walk: walker->pte_gpa[walker->level - 1] = pte_gpa; real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - PFERR_USER_MASK|PFERR_WRITE_MASK, + nested_access, &walker->fault); /* @@ -362,6 +372,12 @@ retry_walk: trace_kvm_mmu_paging_element(pte, walker->level); + /* + * Inverting the NX it lets us AND it like other + * permission bits. + */ + pte_access = pt_access & (pte ^ walk_nx_mask); + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; @@ -370,14 +386,16 @@ retry_walk: goto error; } - accessed_dirty &= pte; - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); - walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); - errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access); + accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ + walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); + walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask); + errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; @@ -394,7 +412,7 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - FNAME(protect_clean_gpte)(&pte_access, pte); + FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte); else /* * On a write fault, fold the dirty bit into accessed_dirty. @@ -412,10 +430,8 @@ retry_walk: goto retry_walk; } - walker->pt_access = pt_access; - walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, pte_access, pt_access); + __func__, (u64)pte, walker->pte_access, walker->pt_access); return 1; error: @@ -443,7 +459,7 @@ error: */ if (!(errcode & PFERR_RSVD_MASK)) { vcpu->arch.exit_qualification &= 0x187; - vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3; } #endif walker->fault.address = addr; @@ -485,7 +501,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -979,7 +995,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; pte_access &= FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) @@ -1025,3 +1041,4 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT #undef PT_GUEST_ACCESSED_SHIFT +#undef PT_HAVE_ACCESSED_DIRTY diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index 9d4a8504a95a..5ab4a364348e 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -294,7 +294,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ((u64)1 << edx.split.bit_width_fixed) - 1; } - pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d1efe2c62b3f..905ea6052517 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/amd-iommu.h> #include <linux/hashtable.h> +#include <linux/frame.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -189,6 +190,7 @@ struct vcpu_svm { struct nested_state nested; bool nmi_singlestep; + u64 nmi_singlestep_guest_rflags; unsigned int3_injected; unsigned long int3_rip; @@ -741,7 +743,6 @@ static int svm_hardware_enable(void) struct svm_cpu_data *sd; uint64_t efer; - struct desc_ptr gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -763,8 +764,7 @@ static int svm_hardware_enable(void) sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; - native_store_gdt(&gdt_descr); - gdt = (struct desc_struct *)gdt_descr.address; + gdt = get_current_gdt_rw(); sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); wrmsrl(MSR_EFER, efer | EFER_SVME); @@ -965,6 +965,18 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +static void disable_nmi_singlestep(struct vcpu_svm *svm) +{ + svm->nmi_singlestep = false; + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { + /* Clear our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; + } +} + /* Note: * This hash table is used to map VM_ID to a struct kvm_arch, * when handling AMD IOMMU GALOG notification to schedule in @@ -1198,10 +1210,13 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_CLGI); set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); set_intercept(svm, INTERCEPT_XSETBV); + if (!kvm_mwait_in_guest()) { + set_intercept(svm, INTERCEPT_MONITOR); + set_intercept(svm, INTERCEPT_MWAIT); + } + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); control->int_ctl = V_INTR_MASKING_MASK; @@ -1271,7 +1286,8 @@ static void init_vmcb(struct vcpu_svm *svm) } -static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index) +static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, + unsigned int index) { u64 *avic_physical_id_table; struct kvm_arch *vm_data = &vcpu->kvm->arch; @@ -1379,6 +1395,9 @@ static void avic_vm_destroy(struct kvm *kvm) unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; + if (!avic) + return; + avic_free_vm_id(vm_data->avic_vm_id); if (vm_data->avic_logical_id_table_page) @@ -1707,11 +1726,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { - return to_svm(vcpu)->vmcb->save.rflags; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long rflags = svm->vmcb->save.rflags; + + if (svm->nmi_singlestep) { + /* Hide our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + rflags &= ~X86_EFLAGS_RF; + } + return rflags; } static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + if (to_svm(vcpu)->nmi_singlestep) + rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + /* * Any change of EFLAGS.VM is accompanied by a reload of SS * (caused by either a task switch or an inter-privilege IRET), @@ -1802,7 +1834,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, * AMD's VMCB does not have an explicit unusable field, so emulate it * for cross vendor migration purposes by "not present" */ - var->unusable = !var->present || (var->type == 0); + var->unusable = !var->present; switch (seg) { case VCPU_SREG_TR: @@ -1835,6 +1867,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, */ if (var->unusable) var->db = 0; + /* This is symmetric with svm_set_segment() */ var->dpl = to_svm(vcpu)->vmcb->save.cpl; break; } @@ -1975,18 +2008,14 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->base = var->base; s->limit = var->limit; s->selector = var->selector; - if (var->unusable) - s->attrib = 0; - else { - s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); - s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; - s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; - s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; - s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; - s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; - s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; - s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; - } + s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); + s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; + s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; + s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT; + s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; + s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; + s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; + s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; /* * This is always accurate, except if SYSRET returned to a segment @@ -1995,7 +2024,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, * would entail passing the CPL to userspace and back. */ if (seg == VCPU_SREG_SS) - svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; + /* This is symmetric with svm_get_segment() */ + svm->vmcb->save.cpl = (var->dpl & 3); mark_dirty(svm->vmcb, VMCB_SEG); } @@ -2108,10 +2138,7 @@ static int db_interception(struct vcpu_svm *svm) } if (svm->nmi_singlestep) { - svm->nmi_singlestep = false; - if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) - svm->vmcb->save.rflags &= - ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + disable_nmi_singlestep(svm); } if (svm->vcpu.guest_debug & @@ -2366,8 +2393,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.efer & EFER_SVME) - || !is_paging(&svm->vcpu)) { + if (!(svm->vcpu.arch.efer & EFER_SVME) || + !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -2377,7 +2404,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) return 1; } - return 0; + return 0; } static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, @@ -2530,6 +2557,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } +/* DB exceptions for our internal use must not cause vmexit */ +static int nested_svm_intercept_db(struct vcpu_svm *svm) +{ + unsigned long dr6; + + /* if we're not singlestepping, it's not ours */ + if (!svm->nmi_singlestep) + return NESTED_EXIT_DONE; + + /* if it's not a singlestep exception, it's not ours */ + if (kvm_get_dr(&svm->vcpu, 6, &dr6)) + return NESTED_EXIT_DONE; + if (!(dr6 & DR6_BS)) + return NESTED_EXIT_DONE; + + /* if the guest is singlestepping, it should get the vmexit */ + if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { + disable_nmi_singlestep(svm); + return NESTED_EXIT_DONE; + } + + /* it's ours, the nested hypervisor must not see this one */ + return NESTED_EXIT_HOST; +} + static int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; @@ -2585,8 +2637,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) - vmexit = NESTED_EXIT_DONE; + if (svm->nested.intercept_exceptions & excp_bits) { + if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) + vmexit = nested_svm_intercept_db(svm); + else + vmexit = NESTED_EXIT_DONE; + } /* async page fault always cause vmexit */ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && svm->apf_reason != 0) @@ -4623,10 +4679,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ + if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0) + return; /* STGI will cause a vm exit */ + + if (svm->nested.exit_required) + return; /* we're not going to run the guest yet */ + /* * Something prevents NMI from been injected. Single step over possible * problem (IRET or exception injection or interrupt shadow) */ + svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } @@ -4767,6 +4830,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->nested.exit_required)) return; + /* + * Disable singlestep if we're injecting an interrupt/exception. + * We don't want our modified rflags to be pushed on the stack where + * we might not be able to easily reset them if we disabled NMI + * singlestep later. + */ + if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { + /* + * Event injection happens before external interrupts cause a + * vmexit and interrupts are disabled here, so smp_send_reschedule + * is enough to force an immediate vmexit. + */ + disable_nmi_singlestep(svm); + smp_send_reschedule(vcpu->cpu); + } + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); @@ -4903,6 +4982,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } +STACK_FRAME_NON_STANDARD(svm_vcpu_run); static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { @@ -5253,6 +5333,12 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +static void svm_setup_mce(struct kvm_vcpu *vcpu) +{ + /* [63:9] are reserved. */ + vcpu->arch.mcg_cap &= 0x1ff; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5364,6 +5450,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, + .setup_mce = svm_setup_mce, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 98e82ee1e699..f76efad248ab 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -33,6 +33,7 @@ #include <linux/slab.h> #include <linux/tboot.h> #include <linux/hrtimer.h> +#include <linux/frame.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -48,6 +49,7 @@ #include <asm/kexec.h> #include <asm/apic.h> #include <asm/irq_remapping.h> +#include <asm/mmu_context.h> #include "trace.h" #include "pmu.h" @@ -84,9 +86,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); -static bool __read_mostly vmm_exclusive = 1; -module_param(vmm_exclusive, bool, S_IRUGO); - static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); @@ -251,6 +250,7 @@ struct __packed vmcs12 { u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; + u64 pml_address; u64 guest_ia32_debugctl; u64 guest_ia32_pat; u64 guest_ia32_efer; @@ -372,6 +372,7 @@ struct __packed vmcs12 { u16 guest_ldtr_selector; u16 guest_tr_selector; u16 guest_intr_status; + u16 guest_pml_index; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -410,6 +411,7 @@ struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; gpa_t vmxon_ptr; + bool pml_full; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; @@ -596,6 +598,7 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; + unsigned long vmcs_host_cr3; /* May not match real cr3 */ unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { @@ -615,10 +618,6 @@ struct vcpu_vmx { int vpid; bool emulation_required; - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; u32 exit_reason; /* Posted interrupt descriptor */ @@ -749,6 +748,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), FIELD(GUEST_TR_SELECTOR, guest_tr_selector), FIELD(GUEST_INTR_STATUS, guest_intr_status), + FIELD(GUEST_PML_INDEX, guest_pml_index), FIELD(HOST_ES_SELECTOR, host_es_selector), FIELD(HOST_CS_SELECTOR, host_cs_selector), FIELD(HOST_SS_SELECTOR, host_ss_selector), @@ -774,6 +774,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), + FIELD64(PML_ADDRESS, pml_address), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), FIELD64(GUEST_IA32_PAT, guest_ia32_pat), FIELD64(GUEST_IA32_EFER, guest_ia32_efer), @@ -912,10 +913,9 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(unsigned long root_hpa); -static void kvm_cpu_vmxon(u64 addr); -static void kvm_cpu_vmxoff(void); +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. */ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); -static DEFINE_PER_CPU(struct desc_ptr, host_gdt); /* * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we @@ -1239,6 +1238,11 @@ static inline bool cpu_has_vmx_invvpid_global(void) return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; } +static inline bool cpu_has_vmx_invvpid(void) +{ + return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -1285,11 +1289,6 @@ static inline bool cpu_has_vmx_invpcid(void) SECONDARY_EXEC_ENABLE_INVPCID; } -static inline bool cpu_has_virtual_nmis(void) -{ - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; -} - static inline bool cpu_has_vmx_wbinvd_exit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -1324,6 +1323,11 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } +static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) +{ + return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low); +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -1358,6 +1362,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) vmx_xsaves_supported(); } +static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); +} + static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -2052,14 +2061,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) */ static unsigned long segment_base(u16 selector) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *table; unsigned long v; if (!(selector & ~SEGMENT_RPL_MASK)) return 0; - table = (struct desc_struct *)gdt->address; + table = get_current_gdt_ro(); if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { u16 ldt_selector = kvm_read_ldt(); @@ -2164,7 +2172,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (vmx->host_state.msr_host_bndcfgs) wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - load_gdt(this_cpu_ptr(&host_gdt)); + load_fixmap_gdt(raw_smp_processor_id()); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -2235,15 +2243,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - else if (!already_loaded) - loaded_vmcs_clear(vmx->loaded_vmcs); - if (!already_loaded) { + loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); crash_disable_local_vmclear(cpu); @@ -2266,7 +2269,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (!already_loaded) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + void *gdt = get_current_gdt_ro(); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -2277,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ vmcs_writel(HOST_TR_BASE, (unsigned long)this_cpu_ptr(&cpu_tss)); - vmcs_writel(HOST_GDTR_BASE, gdt->address); + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ /* * VM exits change the host TR limit to 0x67 after a VM @@ -2321,11 +2324,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) vmx_vcpu_pi_put(vcpu); __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) { - __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); - vcpu->cpu = -1; - kvm_cpu_vmxoff(); - } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); @@ -2431,7 +2429,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; - nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); return 1; @@ -2749,11 +2747,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high); vmx->nested.nested_vmx_secondary_ctls_low = 0; vmx->nested.nested_vmx_secondary_ctls_high &= + SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | @@ -2764,14 +2762,19 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | - VMX_EPT_INVEPT_BIT; + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; if (cpu_has_vmx_ept_execute_only()) vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT; + VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_1GB_PAGE_BIT; + if (enable_ept_ad_bits) { + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_PML; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + } } else vmx->nested.nested_vmx_ept_caps = 0; @@ -2781,10 +2784,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * though it is treated as global context. The alternative is * not failing the single-context invvpid, and it is worse. */ - if (enable_vpid) + if (enable_vpid) { + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VPID; vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; - else + } else vmx->nested.nested_vmx_vpid_caps = 0; if (enable_unrestricted_guest) @@ -3194,7 +3199,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -3276,7 +3282,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) + return 1; + if (is_noncanonical_address(data & PAGE_MASK) || + (data & MSR_IA32_BNDCFGS_RSVD)) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -3416,6 +3426,7 @@ static __init int vmx_disabled_by_bios(void) static void kvm_cpu_vmxon(u64 addr) { + cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); asm volatile (ASM_VMX_VMXON_RAX @@ -3458,14 +3469,8 @@ static int hardware_enable(void) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } - cr4_set_bits(X86_CR4_VMXE); - - if (vmm_exclusive) { - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); - } - - native_store_gdt(this_cpu_ptr(&host_gdt)); + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); return 0; } @@ -3489,15 +3494,13 @@ static void kvm_cpu_vmxoff(void) asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); intel_pt_handle_vmx(0); + cr4_clear_bits(X86_CR4_VMXE); } static void hardware_disable(void) { - if (vmm_exclusive) { - vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); - } - cr4_clear_bits(X86_CR4_VMXE); + vmclear_local_loaded_vmcss(); + kvm_cpu_vmxoff(); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -3547,11 +3550,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; + if (!kvm_mwait_in_guest()) + min |= CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING; + opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -3617,9 +3622,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -4011,11 +4016,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); + } else { + vpid_sync_context(vpid); } } @@ -4024,6 +4030,12 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); } +static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) +{ + if (enable_ept) + vmx_flush_tlb(vcpu); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4182,14 +4194,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } -static u64 construct_eptp(unsigned long root_hpa) +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) { u64 eptp; /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; - if (enable_ept_ad_bits) + if (enable_ept_ad_bits && + (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); @@ -4203,7 +4216,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) guest_cr3 = cr3; if (enable_ept) { - eptp = construct_eptp(cr3); + eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); if (is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); @@ -5009,12 +5022,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; - unsigned long cr0, cr4; + unsigned long cr0, cr3, cr4; cr0 = read_cr0(); WARN_ON(cr0 & X86_CR0_TS); vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + /* + * Save the most likely value for this task's CR3 in the VMCS. + * We can't use __get_current_cr3_fast() because we're not atomic. + */ + cr3 = __read_cr3(); + vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ + vmx->host_state.vmcs_host_cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); @@ -5157,7 +5177,8 @@ static void ept_set_mmio_spte_mask(void) * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, + VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -5285,8 +5306,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; - vmx->soft_vnmi_blocked = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vcpu, 0); @@ -5406,8 +5425,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void enable_nmi_window(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis() || - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } @@ -5448,19 +5466,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (!is_guest_mode(vcpu)) { - if (!cpu_has_virtual_nmis()) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. - */ - vmx->soft_vnmi_blocked = 1; - vmx->vnmi_blocked_time = 0; - } - ++vcpu->stat.nmi_injections; vmx->nmi_known_unmasked = false; } @@ -5477,8 +5482,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis()) - return to_vmx(vcpu)->soft_vnmi_blocked; if (to_vmx(vcpu)->nmi_known_unmasked) return false; return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; @@ -5488,20 +5491,13 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!cpu_has_virtual_nmis()) { - if (vmx->soft_vnmi_blocked != masked) { - vmx->soft_vnmi_blocked = masked; - vmx->vnmi_blocked_time = 0; - } - } else { - vmx->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - } + vmx->nmi_known_unmasked = !masked; + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -5509,9 +5505,6 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) if (to_vmx(vcpu)->nested.nested_run_pending) return 0; - if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - return 0; - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); @@ -6232,23 +6225,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) unsigned long exit_qualification; gpa_t gpa; u32 error_code; - int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity == 0x2) { - printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); - printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - vmcs_readl(GUEST_LINEAR_ADDRESS)); - printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", - (long unsigned int)exit_qualification); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; - return 0; - } - /* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. @@ -6256,7 +6235,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * AAK134, BY25. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -6342,7 +6320,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); @@ -6472,7 +6450,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); + VMX_EPT_RWX_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); @@ -6517,8 +6495,10 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); - if (!cpu_has_vmx_vpid()) + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || + !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) @@ -6531,7 +6511,7 @@ static __init int hardware_setup(void) enable_ept_ad_bits = 0; } - if (!cpu_has_vmx_ept_ad_bits()) + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; if (!cpu_has_vmx_unrestricted_guest()) @@ -6574,7 +6554,6 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -6941,97 +6920,21 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, return 0; } -/* - * This function performs the various checks including - * - if it's 4KB aligned - * - No bits beyond the physical address width are set - * - Returns 0 on success or else 1 - * (Intel SDM Section 30.3) - */ -static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, - gpa_t *vmpointer) +static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) { gva_t gva; - gpa_t vmptr; struct x86_exception e; - struct page *page; - struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer, + sizeof(*vmpointer), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } - switch (exit_reason) { - case EXIT_REASON_VMON: - /* - * SDM 3: 24.11.5 - * The first 4 bytes of VMXON region contain the supported - * VMCS revision identifier - * - * Note - IA32_VMX_BASIC[48] will never be 1 - * for the nested case; - * which replaces physical address width with 32 - * - */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - if (*(u32 *)kmap(page) != VMCS12_REVISION) { - kunmap(page); - nested_release_page_clean(page); - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - kunmap(page); - nested_release_page_clean(page); - vmx->nested.vmxon_ptr = vmptr; - break; - case EXIT_REASON_VMCLEAR: - if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { - nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } - - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } - break; - case EXIT_REASON_VMPTRLD: - if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { - nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } - - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } - break; - default: - return 1; /* shouldn't happen */ - } - - if (vmpointer) - *vmpointer = vmptr; return 0; } @@ -7093,34 +6996,26 @@ out_msr_bitmap: static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; - struct kvm_segment cs; + gpa_t vmptr; + struct page *page; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - /* The Intel VMX Instruction Reference lists a bunch of bits that - * are prerequisite to running VMXON, most notably cr4.VMXE must be - * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. We test these now: + /* + * The Intel VMX Instruction Reference lists a bunch of bits that are + * prerequisite to running VMXON, most notably cr4.VMXE must be set to + * 1 (see vmx_set_cr4() for when we allow the guest to set this). + * Otherwise, we should fail with #UD. But most faulting conditions + * have already been checked by hardware, prior to the VM-exit for + * VMXON. We do test guest cr4.VMXE because processor CR4 always has + * that bit set to 1 in non-root mode. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || - !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || - (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if (is_long_mode(vcpu) && !cs.l) { + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7132,9 +7027,37 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - + + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; + * which replaces physical address width with 32 + */ + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + + page = nested_get_page(vcpu, vmptr); + if (page == NULL) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + if (*(u32 *)kmap(page) != VMCS12_REVISION) { + kunmap(page); + nested_release_page_clean(page); + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + kunmap(page); + nested_release_page_clean(page); + + vmx->nested.vmxon_ptr = vmptr; ret = enter_vmx_operation(vcpu); if (ret) return ret; @@ -7147,29 +7070,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are * slightly different). It also specifies what exception to inject otherwise. + * Note that many of these exceptions have priority over VM exits, so they + * don't have to be checked again here. */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - struct kvm_segment cs; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.vmxon) { + if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || - (is_long_mode(vcpu) && !cs.l)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 0; - } - return 1; } @@ -7264,9 +7173,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) + if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { + nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); + return kvm_skip_emulated_instruction(vcpu); + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); + return kvm_skip_emulated_instruction(vcpu); + } + if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); @@ -7513,7 +7432,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &gva)) return 1; - /* _system ok, as nested_vmx_check_permission verified cpl=0 */ + /* _system ok, as hardware has verified cpl=0 */ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); } @@ -7596,9 +7515,19 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) + if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { + nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); + return kvm_skip_emulated_instruction(vcpu); + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); + return kvm_skip_emulated_instruction(vcpu); + } + if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; struct page *page; @@ -7646,7 +7575,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &vmcs_gva)) return 1; - /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ + /* ok to use *_system, as hardware has verified cpl=0 */ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, (void *)&to_vmx(vcpu)->nested.current_vmptr, sizeof(u64), &e)) { @@ -7679,11 +7608,6 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); @@ -7733,7 +7657,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) unsigned long type, types; gva_t gva; struct x86_exception e; - int vpid; + struct { + u64 vpid; + u64 gla; + } operand; if (!(vmx->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -7763,17 +7690,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, - sizeof(u32), &e)) { + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } + if (operand.vpid >> 16) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + if (is_noncanonical_address(operand.gla)) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + /* fall through */ case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!vpid) { + if (!operand.vpid) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return kvm_skip_emulated_instruction(vcpu); @@ -7805,7 +7743,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) * "blocked by NMI" bit has to be set before next VM entry. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -7970,11 +7907,13 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); int cr = exit_qualification & 15; - int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_readl(vcpu, reg); + int reg; + unsigned long val; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ + reg = (exit_qualification >> 8) & 15; + val = kvm_register_readl(vcpu, reg); switch (cr) { case 0: if (vmcs12->cr0_guest_host_mask & @@ -8029,6 +7968,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, * lmsw can change bits 1..3 of cr0, and only set bit 0 of * cr0. Other attempted changes are ignored, with no exit. */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; if (vmcs12->cr0_guest_host_mask & 0xe & (val ^ vmcs12->cr0_read_shadow)) return true; @@ -8107,6 +8047,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); + case EXIT_REASON_RDRAND: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); + case EXIT_REASON_RDSEED: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: @@ -8184,6 +8128,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); case EXIT_REASON_PREEMPTION_TIMER: return false; + case EXIT_REASON_PML_FULL: + /* We emulate PML support to L1. */ + return false; default: return true; } @@ -8477,31 +8424,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && - !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu))))) { - if (vmx_interrupt_allowed(vcpu)) { - vmx->soft_vnmi_blocked = 0; - } else if (vmx->vnmi_blocked_time > 1000000000LL && - vcpu->arch.nmi_pending) { - /* - * This CPU don't support us in finding the end of an - * NMI-blocked window if the guest runs with IRQs - * disabled. So we pull the trigger after 1 s of - * futile waiting, but inform the user about this. - */ - printk(KERN_WARNING "%s: Breaking out of NMI-blocked " - "state on VCPU %d after 1 s timeout\n", - __func__, vcpu->vcpu_id); - vmx->soft_vnmi_blocked = 0; - } - } - if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", + exit_reason); kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -8547,6 +8475,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) } else { sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmx_flush_tlb_ept_only(vcpu); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); @@ -8572,8 +8501,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) */ if (!is_guest_mode(vcpu) || !nested_cpu_has2(get_vmcs12(&vmx->vcpu), - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmcs_write64(APIC_ACCESS_ADDR, hpa); + vmx_flush_tlb_ept_only(vcpu); + } } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) @@ -8741,6 +8672,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) ); } } +STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); static bool vmx_has_high_real_mode_segbase(void) { @@ -8768,37 +8700,33 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (cpu_has_virtual_nmis()) { - if (vmx->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); - } else if (unlikely(vmx->soft_vnmi_blocked)) - vmx->vnmi_blocked_time += - ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); + if (vmx->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); } static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, @@ -8913,11 +8841,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr4; - - /* Record the guest's net vcpu time for enforced NMI injections. */ - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) - vmx->entry_time = ktime_get(); + unsigned long debugctlmsr, cr3, cr4; /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ @@ -8939,6 +8863,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->host_state.vmcs_host_cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); @@ -9125,17 +9055,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } +STACK_FRAME_NON_STANDARD(vmx_vcpu_run); -static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; - if (vmx->loaded_vmcs == &vmx->vmcs01) + if (vmx->loaded_vmcs == vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; + vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); vmx_vcpu_load(vcpu, cpu); vcpu->cpu = cpu; @@ -9153,7 +9084,7 @@ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); BUG_ON(r); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); free_nested(vmx); vcpu_put(vcpu); } @@ -9214,11 +9145,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs->shadow_vmcs = NULL; if (!vmx->loaded_vmcs->vmcs) goto free_msrs; - if (!vmm_exclusive) - kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); loaded_vmcs_init(vmx->loaded_vmcs); - if (!vmm_exclusive) - kvm_cpu_vmxoff(); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); @@ -9460,16 +9387,28 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason; + unsigned long exit_qualification = vcpu->arch.exit_qualification; - if (fault->error_code & PFERR_RSVD_MASK) + if (vmx->nested.pml_full) { + exit_reason = EXIT_REASON_PML_FULL; + vmx->nested.pml_full = false; + exit_qualification &= INTR_INFO_UNBLOCK_NMI; + } else if (fault->error_code & PFERR_RSVD_MASK) exit_reason = EXIT_REASON_EPT_MISCONFIG; else exit_reason = EXIT_REASON_EPT_VIOLATION; - nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); + + nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; } +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) +{ + return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT; +} + /* Callbacks for nested_ept_init_mmu_context: */ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) @@ -9478,17 +9417,26 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { + bool wants_ad; + WARN_ON(mmu_is_nested(vcpu)); + wants_ad = nested_ept_ad_enabled(vcpu); + if (wants_ad && !enable_ept_ad_bits) + return 1; + + kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.nested_vmx_ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT); + VMX_EPT_EXECUTE_ONLY_BIT, + wants_ad); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + return 0; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -9800,6 +9748,22 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u64 address = vmcs12->pml_address; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) { + if (!nested_cpu_has_ept(vmcs12) || + !IS_ALIGNED(address, 4096) || + address >> maxphyaddr) + return -EINVAL; + } + + return 0; +} + static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -9973,8 +9937,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool from_vmentry, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; - bool nested_ept_enabled = false; + u32 exec_control, vmcs12_exec_ctrl; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -10105,8 +10068,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) - exec_control |= vmcs12->secondary_vm_exec_control; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { + vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & + ~SECONDARY_EXEC_ENABLE_PML; + exec_control |= vmcs12_exec_ctrl; + } if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { vmcs_write64(EOI_EXIT_BITMAP0, @@ -10121,8 +10087,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_intr_status); } - nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; - /* * Write an illegal value to APIC_ACCESS_ADDR. Later, * nested_get_vmcs12_pages will either fix it up or @@ -10252,9 +10216,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } + if (enable_pml) { + /* + * Conceptually we want to copy the PML address and index from + * vmcs01 here, and then back to vmcs01 on nested vmexit. But, + * since we always flush the log on each vmexit, this happens + * to be equivalent to simply resetting the fields in vmcs02. + */ + ASSERT(vmx->pml_pg); + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } + if (nested_cpu_has_ept(vmcs12)) { - kvm_mmu_unload(vcpu); - nested_ept_init_mmu_context(vcpu); + if (nested_ept_init_mmu_context(vcpu)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + } else if (nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb_ept_only(vcpu); } /* @@ -10282,12 +10263,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_set_efer(vcpu, vcpu->arch.efer); /* Shadow page tables on either EPT or shadow page tables. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled, + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) return 1; - kvm_mmu_reset_context(vcpu); - if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; @@ -10323,12 +10302,16 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_pml_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high) || + (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + !vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high)) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) || @@ -10340,6 +10323,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.nested_vmx_entry_ctls_high)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || !nested_cr3_valid(vcpu, vmcs12->host_cr3)) @@ -10407,7 +10393,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct loaded_vmcs *vmcs02; - int cpu; u32 msr_entry_idx; u32 exit_qual; @@ -10420,18 +10405,12 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - cpu = get_cpu(); - vmx->loaded_vmcs = vmcs02; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - + vmx_switch_vmcs(vcpu, vmcs02); vmx_segment_cache_clear(vmx); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, exit_qual); return 1; @@ -10444,7 +10423,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) vmcs12->vm_entry_msr_load_count); if (msr_entry_idx) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); return 1; @@ -10764,8 +10743,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } - if (nested_cpu_has_ept(vmcs12)) - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); @@ -10790,8 +10768,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); - if (nested_cpu_has_xsaves(vmcs12)) - vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); } /* @@ -11012,7 +10988,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (unlikely(vmx->fail)) vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) && nested_exit_intr_ack_set(vcpu)) { @@ -11056,6 +11032,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx->nested.change_vmcs01_virtual_x2apic_mode = false; vmx_set_virtual_x2apic_mode(vcpu, vcpu->arch.apic_base & X2APIC_ENABLE); + } else if (!nested_cpu_has_ept(vmcs12) && + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb_ept_only(vcpu); } /* This is needed for same reason as it was needed in prepare_vmcs02 */ @@ -11184,7 +11164,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) vmx->hv_deadline_tsc = tscl + delta_tsc; vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); - return 0; + + return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) @@ -11220,6 +11201,46 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } +static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t gpa; + struct page *page = NULL; + u64 *pml_address; + + if (is_guest_mode(vcpu)) { + WARN_ON_ONCE(vmx->nested.pml_full); + + /* + * Check if PML is enabled for the nested guest. + * Whether eptp bit 6 is set is already checked + * as part of A/D emulation. + */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + + page = nested_get_page(vcpu, vmcs12->pml_address); + if (!page) + return 0; + + pml_address = kmap(page); + pml_address[vmcs12->guest_pml_index--] = gpa; + kunmap(page); + nested_release_page_clean(page); + } + + return 0; +} + static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t offset, unsigned long mask) @@ -11579,6 +11600,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .write_log_dirty = vmx_write_pml_buffer, .pre_block = vmx_pre_block, .post_block = vmx_post_block, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1faf620a6fdc..6c7266f7766d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,7 +27,6 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" -#include "assigned-dev.h" #include "pmu.h" #include "hyperv.h" @@ -1008,6 +1007,8 @@ static u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, }; static unsigned num_emulated_msrs; @@ -1444,10 +1445,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 usdiff; bool matched; bool already_matched; u64 data = msr->data; + bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); @@ -1455,51 +1456,34 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - int faulted = 0; - - /* n.b - signed multiplication and division required */ - usdiff = data - kvm->arch.last_tsc_write; -#ifdef CONFIG_X86_64 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; -#else - /* do_div() only does unsigned */ - asm("1: idivl %[divisor]\n" - "2: xor %%edx, %%edx\n" - " movl $0, %[faulted]\n" - "3:\n" - ".section .fixup,\"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - - : "=A"(usdiff), [faulted] "=r" (faulted) - : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); - -#endif - do_div(elapsed, 1000); - usdiff -= elapsed; - if (usdiff < 0) - usdiff = -usdiff; - - /* idivl overflow => difference is larger than USEC_PER_SEC */ - if (faulted) - usdiff = USEC_PER_SEC; - } else - usdiff = USEC_PER_SEC; /* disable TSC match window below */ + if (data == 0 && msr->host_initiated) { + /* + * detection of vcpu initialization -- need to sync + * with other vCPUs. This particularly helps to keep + * kvm_clock stable after CPU hotplug + */ + synchronizing = true; + } else { + u64 tsc_exp = kvm->arch.last_tsc_write + + nsec_to_cycles(vcpu, elapsed); + u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; + /* + * Special case: TSC write with a small delta (1 second) + * of virtual cycle time against real time is + * interpreted as an attempt to synchronize the CPU. + */ + synchronizing = data < tsc_exp + tsc_hz && + data + tsc_hz > tsc_exp; + } + } /* - * Special case: TSC write with a small delta (1 second) of virtual - * cycle time against real time is interpreted as an attempt to - * synchronize the CPU. - * * For a reliable TSC, we can match TSC offsets, and for an unstable * TSC, we add elapsed time in this computation. We could let the * compensation code attempt to catch up if we fall behind, but * it's better to try to match offsets from the beginning. */ - if (usdiff < USEC_PER_SEC && + if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; @@ -1769,16 +1753,17 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); spin_unlock(&ka->pvclock_gtod_sync_lock); #endif } -static u64 __get_kvmclock_ns(struct kvm *kvm) +u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; + u64 ret; spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { @@ -1790,22 +1775,17 @@ static u64 __get_kvmclock_ns(struct kvm *kvm) hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; spin_unlock(&ka->pvclock_gtod_sync_lock); + /* both __this_cpu_read() and rdtsc() should be on the same cpu */ + get_cpu(); + kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - return __pvclock_read_cycles(&hv_clock, rdtsc()); -} + ret = __pvclock_read_cycles(&hv_clock, rdtsc()); -u64 get_kvmclock_ns(struct kvm *kvm) -{ - unsigned long flags; - s64 ns; - - local_irq_save(flags); - ns = __get_kvmclock_ns(kvm); - local_irq_restore(flags); + put_cpu(); - return ns; + return ret; } static void kvm_setup_pvclock_page(struct kvm_vcpu *v) @@ -1813,7 +1793,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1834,9 +1814,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1850,16 +1830,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2092,7 +2072,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2111,7 +2091,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2122,7 +2102,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2131,14 +2111,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2155,6 +2135,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_VM_HSAVE_PA: case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: + case MSR_AMD64_DC_CFG: break; case MSR_EFER: @@ -2230,8 +2211,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) bool tmp = (msr == MSR_KVM_SYSTEM_TIME); if (ka->boot_vcpu_runs_old_kvmclock != tmp) - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, - &vcpu->requests); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); ka->boot_vcpu_runs_old_kvmclock = tmp; } @@ -2243,7 +2223,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2264,7 +2244,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2331,6 +2311,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.osvw.status = data; break; + case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated || + data & ~MSR_PLATFORM_INFO_CPUID_FAULT || + (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && + cpuid_fault_enabled(vcpu))) + return 1; + vcpu->arch.msr_platform_info = data; + break; + case MSR_MISC_FEATURES_ENABLES: + if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || + (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + !supports_cpuid_fault(vcpu))) + return 1; + vcpu->arch.msr_misc_features_enables = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -2417,6 +2412,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: + case MSR_AMD64_DC_CFG: msr_info->data = 0; break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: @@ -2545,6 +2541,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.osvw.status; break; + case MSR_PLATFORM_INFO: + msr_info->data = vcpu->arch.msr_platform_info; + break; + case MSR_MISC_FEATURES_ENABLES: + msr_info->data = vcpu->arch.msr_misc_features_enables; + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -2675,15 +2677,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_PCI_2_3: -#endif r = 1; break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_TSC_STABLE; break; + case KVM_CAP_X86_GUEST_MWAIT: + r = kvm_mwait_in_guest(); + break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, @@ -2695,9 +2696,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; @@ -2713,11 +2711,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); - break; -#endif case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; @@ -2816,11 +2809,6 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } -static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ @@ -2853,10 +2841,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_target_expiration_tsc(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_restart_hv_timer(vcpu); + /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration @@ -2864,7 +2852,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) - kvm_migrate_timers(vcpu); + kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; } @@ -2878,7 +2866,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -3124,7 +3112,14 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; if (events->exception.injected && - (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) + (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR || + is_guest_mode(vcpu))) + return -EINVAL; + + /* INITs are latched while in SMM */ + if (events->flags & KVM_VCPUEVENT_VALID_SMM && + (events->smi.smm || events->smi.pending) && + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) return -EINVAL; process_nmi(vcpu); @@ -3301,11 +3296,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, } } +#define XSAVE_MXCSR_OFFSET 24 + static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* @@ -3313,11 +3311,13 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~kvm_supported_xcr0()) + if (xstate_bv & ~kvm_supported_xcr0() || + mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE) + if (xstate_bv & ~XFEATURE_MASK_FPSSE || + mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); @@ -3721,22 +3721,21 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[0], + memcpy(&chip->chip.pic, &pic->pics[0], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[1], + memcpy(&chip->chip.pic, &pic->pics[1], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - r = kvm_get_ioapic(kvm, &chip->chip.ioapic); + kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -3747,32 +3746,31 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[0], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[1], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_IOAPIC: - r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; break; } - kvm_pic_update_irq(pic_irqchip(kvm)); + kvm_pic_update_irq(pic); return r; } @@ -4018,20 +4016,14 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) { - mutex_lock(&kvm->slots_lock); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } r = kvm_setup_default_irq_routing(kvm); if (r) { - mutex_lock(&kvm->slots_lock); - mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->irq_lock); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ @@ -4196,10 +4188,8 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - local_irq_enable(); kvm_gen_update_masterclock(kvm); break; } @@ -4207,11 +4197,9 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); user_ns.clock = now_ns; user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; - local_irq_enable(); memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; @@ -4230,7 +4218,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } default: - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + r = -ENOTTY; } out: return r; @@ -4843,16 +4831,20 @@ emul_write: static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) { - /* TODO: String I/O for in kernel device */ - int r; + int r = 0, i; - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); + for (i = 0; i < vcpu->arch.pio.count; i++) { + if (vcpu->arch.pio.in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + else + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); + if (r) + break; + pd += vcpu->arch.pio.size; + } return r; } @@ -4890,6 +4882,8 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (vcpu->arch.pio.count) goto data_avail; + memset(vcpu->arch.pio_data, 0, size * count); + ret = emulator_pio_in_out(vcpu, size, port, val, count, true); if (ret) { data_avail: @@ -5073,6 +5067,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, if (var.unusable) { memset(desc, 0, sizeof(*desc)); + if (base3) + *base3 = 0; return false; } @@ -5223,6 +5219,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); } +static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +{ + return emul_to_vcpu(ctxt)->arch.hflags; +} + +static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +{ + kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5262,6 +5268,8 @@ static const struct x86_emulate_ops emulate_ops = { .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, .set_nmi_mask = emulator_set_nmi_mask, + .get_hflags = emulator_get_hflags, + .set_hflags = emulator_set_hflags, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -5305,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; + ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : @@ -5314,7 +5324,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); - ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5521,36 +5530,25 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) { struct kvm_run *kvm_run = vcpu->run; - /* - * rflags is the old, "raw" value of the flags. The new value has - * not been saved yet. - * - * This is correct even for TF set by the guest, because "the - * processor will not generate this exception after the instruction - * that sets the TF flag". - */ - if (unlikely(rflags & X86_EFLAGS_TF)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | - DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - /* - * "Certain debug exceptions may clear bit 0-3. The - * remaining contents of the DR6 register are never - * cleared by the processor". - */ - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; + kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5560,7 +5558,17 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) int r = EMULATE_DONE; kvm_x86_ops->skip_emulated_instruction(vcpu); - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + + /* + * rflags is the old, "raw" value of the flags. The new value has + * not been saved yet. + * + * This is correct even for TF set by the guest, because "the + * processor will not generate this exception after the instruction + * that sets the TF flag". + */ + if (unlikely(rflags & X86_EFLAGS_TF)) + kvm_vcpu_do_singlestep(vcpu, &r); return r == EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); @@ -5718,11 +5726,10 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - if (vcpu->arch.hflags != ctxt->emul_flags) - kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + if (r == EMULATE_DONE && + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) __kvm_set_rflags(vcpu, ctxt->eflags); @@ -6004,7 +6011,7 @@ static void kvm_set_mmio_spte_mask(void) mask &= ~1ull; #endif - kvm_mmu_set_mmio_spte_mask(mask); + kvm_mmu_set_mmio_spte_mask(mask, mask); } #ifdef CONFIG_X86_64 @@ -6726,7 +6733,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -6869,7 +6876,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * 1) We should set ->mode before checking ->requests. Please see - * the comment in kvm_make_all_cpus_request. + * the comment in kvm_vcpu_exiting_guest_mode(). * * 2) For APICv, we should set ->mode before checking PIR.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on @@ -6890,7 +6897,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->sync_pir_to_irr(vcpu); } - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched() || signal_pending(current)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -7051,7 +7058,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -7179,7 +7186,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; goto out; } @@ -7355,6 +7362,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; + /* INITs are latched while in SMM */ + if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || + mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) + return -EINVAL; + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); @@ -7724,6 +7737,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (!init_event) { kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; + + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.msr_misc_features_enables = 0; } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); @@ -8068,7 +8084,6 @@ void kvm_arch_sync_events(struct kvm *kvm) { cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -8152,12 +8167,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); - kvm_iommu_unmap_guest(kvm); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); + kvm_pic_destroy(kvm); + kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); + kvm_page_track_cleanup(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, @@ -8198,13 +8213,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->base_gfn, level) + 1; slot->arch.rmap[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL); if (!linfo) goto out_free; @@ -8381,10 +8396,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (vcpu->arch.pv.pv_unhalted) return true; - if (atomic_read(&vcpu->arch.nmi_queued)) + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + (vcpu->arch.nmi_pending && + kvm_x86_ops->nmi_allowed(vcpu))) return true; - if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_SMI, vcpu) || + (vcpu->arch.smi_pending && !is_smm(vcpu))) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -8535,8 +8553,9 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, @@ -8566,11 +8585,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, { struct x86_exception fault; - trace_kvm_async_pf_ready(work->arch.token, work->gva); if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + trace_kvm_async_pf_ready(work->arch.token, work->gva); if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { @@ -8590,8 +8609,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) return true; else - return !kvm_event_needs_reinjection(vcpu) && - kvm_x86_ops->interrupt_allowed(vcpu); + return kvm_can_do_async_pf(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e8ff3e4ce38a..612067074905 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -1,6 +1,8 @@ #ifndef ARCH_X86_KVM_X86_H #define ARCH_X86_KVM_X86_H +#include <asm/processor.h> +#include <asm/mwait.h> #include <linux/kvm_host.h> #include <asm/pvclock.h> #include "kvm_cache_regs.h" @@ -212,4 +214,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) __rem; \ }) +static inline bool kvm_mwait_in_guest(void) +{ + unsigned int eax, ebx, ecx, edx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) + return false; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* All AMD CPUs have a working MWAIT implementation */ + return true; + case X86_VENDOR_INTEL: + /* Handle Intel below */ + break; + default: + return false; + } + + /* + * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as + * they would allow guest to stop the CPU completely by disabling + * interrupts then invoking MWAIT. + */ + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) + return false; + + return true; +} + #endif |