summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig12
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/assigned-dev.c1058
-rw-r--r--arch/x86/kvm/assigned-dev.h32
-rw-r--r--arch/x86/kvm/cpuid.c23
-rw-r--r--arch/x86/kvm/cpuid.h19
-rw-r--r--arch/x86/kvm/emulate.c110
-rw-r--r--arch/x86/kvm/i8259.c75
-rw-r--r--arch/x86/kvm/ioapic.c31
-rw-r--r--arch/x86/kvm/ioapic.h16
-rw-r--r--arch/x86/kvm/iommu.c356
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/irq.h32
-rw-r--r--arch/x86/kvm/irq_comm.c50
-rw-r--r--arch/x86/kvm/lapic.c147
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.c181
-rw-r--r--arch/x86/kvm/mmu.h7
-rw-r--r--arch/x86/kvm/mmutrace.h6
-rw-r--r--arch/x86/kvm/page_track.c12
-rw-r--r--arch/x86/kvm/paging_tmpl.h87
-rw-r--r--arch/x86/kvm/pmu_intel.c2
-rw-r--r--arch/x86/kvm/svm.c147
-rw-r--r--arch/x86/kvm/vmx.c734
-rw-r--r--arch/x86/kvm/x86.c392
-rw-r--r--arch/x86/kvm/x86.h36
26 files changed, 1195 insertions, 2376 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ab8e32f7b9a8..760433b2574a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -86,18 +86,6 @@ config KVM_MMU_AUDIT
This option adds a R/W kVM module parameter 'mmu_audit', which allows
auditing of KVM MMU events at runtime.
-config KVM_DEVICE_ASSIGNMENT
- bool "KVM legacy PCI device assignment support (DEPRECATED)"
- depends on KVM && PCI && IOMMU_API
- default n
- ---help---
- Provide support for legacy PCI device assignment through KVM. The
- kernel now also supports a full featured userspace device driver
- framework through VFIO, which supersedes this support and provides
- better security.
-
- If unsure, say N.
-
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 3bff20710471..09d4b17be022 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -15,8 +15,6 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o page_track.o debugfs.o
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
-
kvm-intel-y += vmx.o pmu_intel.o
kvm-amd-y += svm.o pmu_amd.o
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
deleted file mode 100644
index 308b8597c691..000000000000
--- a/arch/x86/kvm/assigned-dev.c
+++ /dev/null
@@ -1,1058 +0,0 @@
-/*
- * Kernel-based Virtual Machine - device assignment support
- *
- * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/errno.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/namei.h>
-#include <linux/fs.h>
-#include "irq.h"
-#include "assigned-dev.h"
-#include "trace/events/kvm.h"
-
-struct kvm_assigned_dev_kernel {
- struct kvm_irq_ack_notifier ack_notifier;
- struct list_head list;
- int assigned_dev_id;
- int host_segnr;
- int host_busnr;
- int host_devfn;
- unsigned int entries_nr;
- int host_irq;
- bool host_irq_disabled;
- bool pci_2_3;
- struct msix_entry *host_msix_entries;
- int guest_irq;
- struct msix_entry *guest_msix_entries;
- unsigned long irq_requested_type;
- int irq_source_id;
- int flags;
- struct pci_dev *dev;
- struct kvm *kvm;
- spinlock_t intx_lock;
- spinlock_t intx_mask_lock;
- char irq_name[32];
- struct pci_saved_state *pci_saved_state;
-};
-
-static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
- int assigned_dev_id)
-{
- struct kvm_assigned_dev_kernel *match;
-
- list_for_each_entry(match, head, list) {
- if (match->assigned_dev_id == assigned_dev_id)
- return match;
- }
- return NULL;
-}
-
-static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
- *assigned_dev, int irq)
-{
- int i, index;
- struct msix_entry *host_msix_entries;
-
- host_msix_entries = assigned_dev->host_msix_entries;
-
- index = -1;
- for (i = 0; i < assigned_dev->entries_nr; i++)
- if (irq == host_msix_entries[i].vector) {
- index = i;
- break;
- }
- if (index < 0)
- printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
-
- return index;
-}
-
-static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
-{
- struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
- int ret;
-
- spin_lock(&assigned_dev->intx_lock);
- if (pci_check_and_mask_intx(assigned_dev->dev)) {
- assigned_dev->host_irq_disabled = true;
- ret = IRQ_WAKE_THREAD;
- } else
- ret = IRQ_NONE;
- spin_unlock(&assigned_dev->intx_lock);
-
- return ret;
-}
-
-static void
-kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
- int vector)
-{
- if (unlikely(assigned_dev->irq_requested_type &
- KVM_DEV_IRQ_GUEST_INTX)) {
- spin_lock(&assigned_dev->intx_mask_lock);
- if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
- kvm_set_irq(assigned_dev->kvm,
- assigned_dev->irq_source_id, vector, 1,
- false);
- spin_unlock(&assigned_dev->intx_mask_lock);
- } else
- kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
- vector, 1, false);
-}
-
-static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
-{
- struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-
- if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
- spin_lock_irq(&assigned_dev->intx_lock);
- disable_irq_nosync(irq);
- assigned_dev->host_irq_disabled = true;
- spin_unlock_irq(&assigned_dev->intx_lock);
- }
-
- kvm_assigned_dev_raise_guest_irq(assigned_dev,
- assigned_dev->guest_irq);
-
- return IRQ_HANDLED;
-}
-
-/*
- * Deliver an IRQ in an atomic context if we can, or return a failure,
- * user can retry in a process context.
- * Return value:
- * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
- * Other values - No need to retry.
- */
-static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
- int level)
-{
- struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
- struct kvm_kernel_irq_routing_entry *e;
- int ret = -EINVAL;
- int idx;
-
- trace_kvm_set_irq(irq, level, irq_source_id);
-
- /*
- * Injection into either PIC or IOAPIC might need to scan all CPUs,
- * which would need to be retried from thread context; when same GSI
- * is connected to both PIC and IOAPIC, we'd have to report a
- * partial failure here.
- * Since there's no easy way to do this, we only support injecting MSI
- * which is limited to 1:1 GSI mapping.
- */
- idx = srcu_read_lock(&kvm->irq_srcu);
- if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
- e = &entries[0];
- ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
- irq, level);
- }
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
-
-
-static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
-{
- struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
- int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
- assigned_dev->irq_source_id,
- assigned_dev->guest_irq, 1);
- return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
-{
- struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-
- kvm_assigned_dev_raise_guest_irq(assigned_dev,
- assigned_dev->guest_irq);
-
- return IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
-{
- struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
- int index = find_index_from_host_irq(assigned_dev, irq);
- u32 vector;
- int ret = 0;
-
- if (index >= 0) {
- vector = assigned_dev->guest_msix_entries[index].vector;
- ret = kvm_set_irq_inatomic(assigned_dev->kvm,
- assigned_dev->irq_source_id,
- vector, 1);
- }
-
- return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
-{
- struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
- int index = find_index_from_host_irq(assigned_dev, irq);
- u32 vector;
-
- if (index >= 0) {
- vector = assigned_dev->guest_msix_entries[index].vector;
- kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
- }
-
- return IRQ_HANDLED;
-}
-
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
- struct kvm_assigned_dev_kernel *dev =
- container_of(kian, struct kvm_assigned_dev_kernel,
- ack_notifier);
-
- kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);
-
- spin_lock(&dev->intx_mask_lock);
-
- if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
- bool reassert = false;
-
- spin_lock_irq(&dev->intx_lock);
- /*
- * The guest IRQ may be shared so this ack can come from an
- * IRQ for another guest device.
- */
- if (dev->host_irq_disabled) {
- if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
- enable_irq(dev->host_irq);
- else if (!pci_check_and_unmask_intx(dev->dev))
- reassert = true;
- dev->host_irq_disabled = reassert;
- }
- spin_unlock_irq(&dev->intx_lock);
-
- if (reassert)
- kvm_set_irq(dev->kvm, dev->irq_source_id,
- dev->guest_irq, 1, false);
- }
-
- spin_unlock(&dev->intx_mask_lock);
-}
-
-static void deassign_guest_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *assigned_dev)
-{
- if (assigned_dev->ack_notifier.gsi != -1)
- kvm_unregister_irq_ack_notifier(kvm,
- &assigned_dev->ack_notifier);
-
- kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
- assigned_dev->guest_irq, 0, false);
-
- if (assigned_dev->irq_source_id != -1)
- kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
- assigned_dev->irq_source_id = -1;
- assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
-}
-
-/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
-static void deassign_host_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *assigned_dev)
-{
- /*
- * We disable irq here to prevent further events.
- *
- * Notice this maybe result in nested disable if the interrupt type is
- * INTx, but it's OK for we are going to free it.
- *
- * If this function is a part of VM destroy, please ensure that till
- * now, the kvm state is still legal for probably we also have to wait
- * on a currently running IRQ handler.
- */
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
- int i;
- for (i = 0; i < assigned_dev->entries_nr; i++)
- disable_irq(assigned_dev->host_msix_entries[i].vector);
-
- for (i = 0; i < assigned_dev->entries_nr; i++)
- free_irq(assigned_dev->host_msix_entries[i].vector,
- assigned_dev);
-
- assigned_dev->entries_nr = 0;
- kfree(assigned_dev->host_msix_entries);
- kfree(assigned_dev->guest_msix_entries);
- pci_disable_msix(assigned_dev->dev);
- } else {
- /* Deal with MSI and INTx */
- if ((assigned_dev->irq_requested_type &
- KVM_DEV_IRQ_HOST_INTX) &&
- (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
- spin_lock_irq(&assigned_dev->intx_lock);
- pci_intx(assigned_dev->dev, false);
- spin_unlock_irq(&assigned_dev->intx_lock);
- synchronize_irq(assigned_dev->host_irq);
- } else
- disable_irq(assigned_dev->host_irq);
-
- free_irq(assigned_dev->host_irq, assigned_dev);
-
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
- pci_disable_msi(assigned_dev->dev);
- }
-
- assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
-}
-
-static int kvm_deassign_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *assigned_dev,
- unsigned long irq_requested_type)
-{
- unsigned long guest_irq_type, host_irq_type;
-
- if (!irqchip_in_kernel(kvm))
- return -EINVAL;
- /* no irq assignment to deassign */
- if (!assigned_dev->irq_requested_type)
- return -ENXIO;
-
- host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
- guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
-
- if (host_irq_type)
- deassign_host_irq(kvm, assigned_dev);
- if (guest_irq_type)
- deassign_guest_irq(kvm, assigned_dev);
-
- return 0;
-}
-
-static void kvm_free_assigned_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *assigned_dev)
-{
- kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
-}
-
-static void kvm_free_assigned_device(struct kvm *kvm,
- struct kvm_assigned_dev_kernel
- *assigned_dev)
-{
- kvm_free_assigned_irq(kvm, assigned_dev);
-
- pci_reset_function(assigned_dev->dev);
- if (pci_load_and_free_saved_state(assigned_dev->dev,
- &assigned_dev->pci_saved_state))
- printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
- __func__, dev_name(&assigned_dev->dev->dev));
- else
- pci_restore_state(assigned_dev->dev);
-
- pci_clear_dev_assigned(assigned_dev->dev);
-
- pci_release_regions(assigned_dev->dev);
- pci_disable_device(assigned_dev->dev);
- pci_dev_put(assigned_dev->dev);
-
- list_del(&assigned_dev->list);
- kfree(assigned_dev);
-}
-
-void kvm_free_all_assigned_devices(struct kvm *kvm)
-{
- struct kvm_assigned_dev_kernel *assigned_dev, *tmp;
-
- list_for_each_entry_safe(assigned_dev, tmp,
- &kvm->arch.assigned_dev_head, list) {
- kvm_free_assigned_device(kvm, assigned_dev);
- }
-}
-
-static int assigned_device_enable_host_intx(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev)
-{
- irq_handler_t irq_handler;
- unsigned long flags;
-
- dev->host_irq = dev->dev->irq;
-
- /*
- * We can only share the IRQ line with other host devices if we are
- * able to disable the IRQ source at device-level - independently of
- * the guest driver. Otherwise host devices may suffer from unbounded
- * IRQ latencies when the guest keeps the line asserted.
- */
- if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
- irq_handler = kvm_assigned_dev_intx;
- flags = IRQF_SHARED;
- } else {
- irq_handler = NULL;
- flags = IRQF_ONESHOT;
- }
- if (request_threaded_irq(dev->host_irq, irq_handler,
- kvm_assigned_dev_thread_intx, flags,
- dev->irq_name, dev))
- return -EIO;
-
- if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
- spin_lock_irq(&dev->intx_lock);
- pci_intx(dev->dev, true);
- spin_unlock_irq(&dev->intx_lock);
- }
- return 0;
-}
-
-static int assigned_device_enable_host_msi(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev)
-{
- int r;
-
- if (!dev->dev->msi_enabled) {
- r = pci_enable_msi(dev->dev);
- if (r)
- return r;
- }
-
- dev->host_irq = dev->dev->irq;
- if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
- kvm_assigned_dev_thread_msi, 0,
- dev->irq_name, dev)) {
- pci_disable_msi(dev->dev);
- return -EIO;
- }
-
- return 0;
-}
-
-static int assigned_device_enable_host_msix(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev)
-{
- int i, r = -EINVAL;
-
- /* host_msix_entries and guest_msix_entries should have been
- * initialized */
- if (dev->entries_nr == 0)
- return r;
-
- r = pci_enable_msix_exact(dev->dev,
- dev->host_msix_entries, dev->entries_nr);
- if (r)
- return r;
-
- for (i = 0; i < dev->entries_nr; i++) {
- r = request_threaded_irq(dev->host_msix_entries[i].vector,
- kvm_assigned_dev_msix,
- kvm_assigned_dev_thread_msix,
- 0, dev->irq_name, dev);
- if (r)
- goto err;
- }
-
- return 0;
-err:
- for (i -= 1; i >= 0; i--)
- free_irq(dev->host_msix_entries[i].vector, dev);
- pci_disable_msix(dev->dev);
- return r;
-}
-
-static int assigned_device_enable_guest_intx(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- struct kvm_assigned_irq *irq)
-{
- dev->guest_irq = irq->guest_irq;
- dev->ack_notifier.gsi = irq->guest_irq;
- return 0;
-}
-
-static int assigned_device_enable_guest_msi(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- struct kvm_assigned_irq *irq)
-{
- dev->guest_irq = irq->guest_irq;
- dev->ack_notifier.gsi = -1;
- return 0;
-}
-
-static int assigned_device_enable_guest_msix(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- struct kvm_assigned_irq *irq)
-{
- dev->guest_irq = irq->guest_irq;
- dev->ack_notifier.gsi = -1;
- return 0;
-}
-
-static int assign_host_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- __u32 host_irq_type)
-{
- int r = -EEXIST;
-
- if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
- return r;
-
- snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
- pci_name(dev->dev));
-
- switch (host_irq_type) {
- case KVM_DEV_IRQ_HOST_INTX:
- r = assigned_device_enable_host_intx(kvm, dev);
- break;
- case KVM_DEV_IRQ_HOST_MSI:
- r = assigned_device_enable_host_msi(kvm, dev);
- break;
- case KVM_DEV_IRQ_HOST_MSIX:
- r = assigned_device_enable_host_msix(kvm, dev);
- break;
- default:
- r = -EINVAL;
- }
- dev->host_irq_disabled = false;
-
- if (!r)
- dev->irq_requested_type |= host_irq_type;
-
- return r;
-}
-
-static int assign_guest_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- struct kvm_assigned_irq *irq,
- unsigned long guest_irq_type)
-{
- int id;
- int r = -EEXIST;
-
- if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
- return r;
-
- id = kvm_request_irq_source_id(kvm);
- if (id < 0)
- return id;
-
- dev->irq_source_id = id;
-
- switch (guest_irq_type) {
- case KVM_DEV_IRQ_GUEST_INTX:
- r = assigned_device_enable_guest_intx(kvm, dev, irq);
- break;
- case KVM_DEV_IRQ_GUEST_MSI:
- r = assigned_device_enable_guest_msi(kvm, dev, irq);
- break;
- case KVM_DEV_IRQ_GUEST_MSIX:
- r = assigned_device_enable_guest_msix(kvm, dev, irq);
- break;
- default:
- r = -EINVAL;
- }
-
- if (!r) {
- dev->irq_requested_type |= guest_irq_type;
- if (dev->ack_notifier.gsi != -1)
- kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
- } else {
- kvm_free_irq_source_id(kvm, dev->irq_source_id);
- dev->irq_source_id = -1;
- }
-
- return r;
-}
-
-/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
-static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
- struct kvm_assigned_irq *assigned_irq)
-{
- int r = -EINVAL;
- struct kvm_assigned_dev_kernel *match;
- unsigned long host_irq_type, guest_irq_type;
-
- if (!irqchip_in_kernel(kvm))
- return r;
-
- mutex_lock(&kvm->lock);
- r = -ENODEV;
- match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- assigned_irq->assigned_dev_id);
- if (!match)
- goto out;
-
- host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
- guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
-
- r = -EINVAL;
- /* can only assign one type at a time */
- if (hweight_long(host_irq_type) > 1)
- goto out;
- if (hweight_long(guest_irq_type) > 1)
- goto out;
- if (host_irq_type == 0 && guest_irq_type == 0)
- goto out;
-
- r = 0;
- if (host_irq_type)
- r = assign_host_irq(kvm, match, host_irq_type);
- if (r)
- goto out;
-
- if (guest_irq_type)
- r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
- struct kvm_assigned_irq
- *assigned_irq)
-{
- int r = -ENODEV;
- struct kvm_assigned_dev_kernel *match;
- unsigned long irq_type;
-
- mutex_lock(&kvm->lock);
-
- match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- assigned_irq->assigned_dev_id);
- if (!match)
- goto out;
-
- irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
- KVM_DEV_IRQ_GUEST_MASK);
- r = kvm_deassign_irq(kvm, match, irq_type);
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-/*
- * We want to test whether the caller has been granted permissions to
- * use this device. To be able to configure and control the device,
- * the user needs access to PCI configuration space and BAR resources.
- * These are accessed through PCI sysfs. PCI config space is often
- * passed to the process calling this ioctl via file descriptor, so we
- * can't rely on access to that file. We can check for permissions
- * on each of the BAR resource files, which is a pretty clear
- * indicator that the user has been granted access to the device.
- */
-static int probe_sysfs_permissions(struct pci_dev *dev)
-{
-#ifdef CONFIG_SYSFS
- int i;
- bool bar_found = false;
-
- for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
- char *kpath, *syspath;
- struct path path;
- struct inode *inode;
- int r;
-
- if (!pci_resource_len(dev, i))
- continue;
-
- kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
- if (!kpath)
- return -ENOMEM;
-
- /* Per sysfs-rules, sysfs is always at /sys */
- syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
- kfree(kpath);
- if (!syspath)
- return -ENOMEM;
-
- r = kern_path(syspath, LOOKUP_FOLLOW, &path);
- kfree(syspath);
- if (r)
- return r;
-
- inode = d_backing_inode(path.dentry);
-
- r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
- path_put(&path);
- if (r)
- return r;
-
- bar_found = true;
- }
-
- /* If no resources, probably something special */
- if (!bar_found)
- return -EPERM;
-
- return 0;
-#else
- return -EINVAL; /* No way to control the device without sysfs */
-#endif
-}
-
-static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
- struct kvm_assigned_pci_dev *assigned_dev)
-{
- int r = 0, idx;
- struct kvm_assigned_dev_kernel *match;
- struct pci_dev *dev;
-
- if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
- return -EINVAL;
-
- mutex_lock(&kvm->lock);
- idx = srcu_read_lock(&kvm->srcu);
-
- match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- assigned_dev->assigned_dev_id);
- if (match) {
- /* device already assigned */
- r = -EEXIST;
- goto out;
- }
-
- match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
- if (match == NULL) {
- printk(KERN_INFO "%s: Couldn't allocate memory\n",
- __func__);
- r = -ENOMEM;
- goto out;
- }
- dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
- assigned_dev->busnr,
- assigned_dev->devfn);
- if (!dev) {
- printk(KERN_INFO "%s: host device not found\n", __func__);
- r = -EINVAL;
- goto out_free;
- }
-
- /* Don't allow bridges to be assigned */
- if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
- r = -EPERM;
- goto out_put;
- }
-
- r = probe_sysfs_permissions(dev);
- if (r)
- goto out_put;
-
- if (pci_enable_device(dev)) {
- printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
- r = -EBUSY;
- goto out_put;
- }
- r = pci_request_regions(dev, "kvm_assigned_device");
- if (r) {
- printk(KERN_INFO "%s: Could not get access to device regions\n",
- __func__);
- goto out_disable;
- }
-
- pci_reset_function(dev);
- pci_save_state(dev);
- match->pci_saved_state = pci_store_saved_state(dev);
- if (!match->pci_saved_state)
- printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
- __func__, dev_name(&dev->dev));
-
- if (!pci_intx_mask_supported(dev))
- assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
-
- match->assigned_dev_id = assigned_dev->assigned_dev_id;
- match->host_segnr = assigned_dev->segnr;
- match->host_busnr = assigned_dev->busnr;
- match->host_devfn = assigned_dev->devfn;
- match->flags = assigned_dev->flags;
- match->dev = dev;
- spin_lock_init(&match->intx_lock);
- spin_lock_init(&match->intx_mask_lock);
- match->irq_source_id = -1;
- match->kvm = kvm;
- match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-
- list_add(&match->list, &kvm->arch.assigned_dev_head);
-
- if (!kvm->arch.iommu_domain) {
- r = kvm_iommu_map_guest(kvm);
- if (r)
- goto out_list_del;
- }
- r = kvm_assign_device(kvm, match->dev);
- if (r)
- goto out_list_del;
-
-out:
- srcu_read_unlock(&kvm->srcu, idx);
- mutex_unlock(&kvm->lock);
- return r;
-out_list_del:
- if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
- printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
- __func__, dev_name(&dev->dev));
- list_del(&match->list);
- pci_release_regions(dev);
-out_disable:
- pci_disable_device(dev);
-out_put:
- pci_dev_put(dev);
-out_free:
- kfree(match);
- srcu_read_unlock(&kvm->srcu, idx);
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
- struct kvm_assigned_pci_dev *assigned_dev)
-{
- int r = 0;
- struct kvm_assigned_dev_kernel *match;
-
- mutex_lock(&kvm->lock);
-
- match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- assigned_dev->assigned_dev_id);
- if (!match) {
- printk(KERN_INFO "%s: device hasn't been assigned before, "
- "so cannot be deassigned\n", __func__);
- r = -EINVAL;
- goto out;
- }
-
- kvm_deassign_device(kvm, match->dev);
-
- kvm_free_assigned_device(kvm, match);
-
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-
-static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
- struct kvm_assigned_msix_nr *entry_nr)
-{
- int r = 0;
- struct kvm_assigned_dev_kernel *adev;
-
- mutex_lock(&kvm->lock);
-
- adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- entry_nr->assigned_dev_id);
- if (!adev) {
- r = -EINVAL;
- goto msix_nr_out;
- }
-
- if (adev->entries_nr == 0) {
- adev->entries_nr = entry_nr->entry_nr;
- if (adev->entries_nr == 0 ||
- adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
- r = -EINVAL;
- goto msix_nr_out;
- }
-
- adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
- entry_nr->entry_nr,
- GFP_KERNEL);
- if (!adev->host_msix_entries) {
- r = -ENOMEM;
- goto msix_nr_out;
- }
- adev->guest_msix_entries =
- kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
- GFP_KERNEL);
- if (!adev->guest_msix_entries) {
- kfree(adev->host_msix_entries);
- r = -ENOMEM;
- goto msix_nr_out;
- }
- } else /* Not allowed set MSI-X number twice */
- r = -EINVAL;
-msix_nr_out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
- struct kvm_assigned_msix_entry *entry)
-{
- int r = 0, i;
- struct kvm_assigned_dev_kernel *adev;
-
- mutex_lock(&kvm->lock);
-
- adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- entry->assigned_dev_id);
-
- if (!adev) {
- r = -EINVAL;
- goto msix_entry_out;
- }
-
- for (i = 0; i < adev->entries_nr; i++)
- if (adev->guest_msix_entries[i].vector == 0 ||
- adev->guest_msix_entries[i].entry == entry->entry) {
- adev->guest_msix_entries[i].entry = entry->entry;
- adev->guest_msix_entries[i].vector = entry->gsi;
- adev->host_msix_entries[i].entry = entry->entry;
- break;
- }
- if (i == adev->entries_nr) {
- r = -ENOSPC;
- goto msix_entry_out;
- }
-
-msix_entry_out:
- mutex_unlock(&kvm->lock);
-
- return r;
-}
-
-static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
- struct kvm_assigned_pci_dev *assigned_dev)
-{
- int r = 0;
- struct kvm_assigned_dev_kernel *match;
-
- mutex_lock(&kvm->lock);
-
- match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- assigned_dev->assigned_dev_id);
- if (!match) {
- r = -ENODEV;
- goto out;
- }
-
- spin_lock(&match->intx_mask_lock);
-
- match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
- match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
-
- if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
- if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
- kvm_set_irq(match->kvm, match->irq_source_id,
- match->guest_irq, 0, false);
- /*
- * Masking at hardware-level is performed on demand,
- * i.e. when an IRQ actually arrives at the host.
- */
- } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
- /*
- * Unmask the IRQ line if required. Unmasking at
- * device level will be performed by user space.
- */
- spin_lock_irq(&match->intx_lock);
- if (match->host_irq_disabled) {
- enable_irq(match->host_irq);
- match->host_irq_disabled = false;
- }
- spin_unlock_irq(&match->intx_lock);
- }
- }
-
- spin_unlock(&match->intx_mask_lock);
-
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
- unsigned long arg)
-{
- void __user *argp = (void __user *)arg;
- int r;
-
- switch (ioctl) {
- case KVM_ASSIGN_PCI_DEVICE: {
- struct kvm_assigned_pci_dev assigned_dev;
-
- r = -EFAULT;
- if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
- goto out;
- r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
- if (r)
- goto out;
- break;
- }
- case KVM_ASSIGN_IRQ: {
- r = -EOPNOTSUPP;
- break;
- }
- case KVM_ASSIGN_DEV_IRQ: {
- struct kvm_assigned_irq assigned_irq;
-
- r = -EFAULT;
- if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
- goto out;
- r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
- if (r)
- goto out;
- break;
- }
- case KVM_DEASSIGN_DEV_IRQ: {
- struct kvm_assigned_irq assigned_irq;
-
- r = -EFAULT;
- if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
- goto out;
- r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
- if (r)
- goto out;
- break;
- }
- case KVM_DEASSIGN_PCI_DEVICE: {
- struct kvm_assigned_pci_dev assigned_dev;
-
- r = -EFAULT;
- if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
- goto out;
- r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
- if (r)
- goto out;
- break;
- }
- case KVM_ASSIGN_SET_MSIX_NR: {
- struct kvm_assigned_msix_nr entry_nr;
- r = -EFAULT;
- if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
- goto out;
- r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
- if (r)
- goto out;
- break;
- }
- case KVM_ASSIGN_SET_MSIX_ENTRY: {
- struct kvm_assigned_msix_entry entry;
- r = -EFAULT;
- if (copy_from_user(&entry, argp, sizeof entry))
- goto out;
- r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
- if (r)
- goto out;
- break;
- }
- case KVM_ASSIGN_SET_INTX_MASK: {
- struct kvm_assigned_pci_dev assigned_dev;
-
- r = -EFAULT;
- if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
- goto out;
- r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
- break;
- }
- default:
- r = -ENOTTY;
- break;
- }
-out:
- return r;
-}
diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h
deleted file mode 100644
index a428c1a211b2..000000000000
--- a/arch/x86/kvm/assigned-dev.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H
-#define ARCH_X86_KVM_ASSIGNED_DEV_H
-
-#include <linux/kvm_host.h>
-
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev);
-int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev);
-
-int kvm_iommu_map_guest(struct kvm *kvm);
-int kvm_iommu_unmap_guest(struct kvm *kvm);
-
-long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
- unsigned long arg);
-
-void kvm_free_all_assigned_devices(struct kvm *kvm);
-#else
-static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
- return 0;
-}
-
-static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
- unsigned long arg)
-{
- return -ENOTTY;
-}
-
-static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {}
-#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */
-
-#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index efde6cc50875..59ca2eea522c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -780,18 +780,20 @@ out:
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
- int j, nent = vcpu->arch.cpuid_nent;
+ struct kvm_cpuid_entry2 *ej;
+ int j = i;
+ int nent = vcpu->arch.cpuid_nent;
e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
/* when no next entry is found, the current entry[i] is reselected */
- for (j = i + 1; ; j = (j + 1) % nent) {
- struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
- if (ej->function == e->function) {
- ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- return j;
- }
- }
- return 0; /* silence gcc, even though control never reaches here */
+ do {
+ j = (j + 1) % nent;
+ ej = &vcpu->arch.cpuid_entries[j];
+ } while (ej->function != e->function);
+
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+
+ return j;
}
/* find an entry with matching function, matching index (if needed), and that
@@ -876,6 +878,9 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 eax, ebx, ecx, edx;
+ if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
+ return 1;
+
eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 35058c2c0eea..da6728383052 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_RTM));
}
+static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_MPX));
+}
+
static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -205,4 +213,15 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
return x86_stepping(best->eax);
}
+static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
+}
+
+static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_misc_features_enables &
+ MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+}
+
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 45c7306c8780..fb0055953fbc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE) \
goto done; \
ctxt->_eip += sizeof(_type); \
- _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \
+ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \
ctxt->fetch.ptr += sizeof(_type); \
_x; \
})
@@ -2547,7 +2547,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
u64 smbase;
int ret;
- if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
+ if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
return emulate_ud(ctxt);
/*
@@ -2596,11 +2596,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
return X86EMUL_UNHANDLEABLE;
}
- if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+ if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
ctxt->ops->set_nmi_mask(ctxt, false);
- ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
- ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
+ ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) &
+ ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK));
return X86EMUL_CONTINUE;
}
@@ -2742,6 +2742,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
}
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
return X86EMUL_CONTINUE;
}
@@ -3854,6 +3855,13 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
static int em_cpuid(struct x86_emulate_ctxt *ctxt)
{
u32 eax, ebx, ecx, edx;
+ u64 msr = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr);
+ if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ ctxt->ops->cpl(ctxt)) {
+ return emulate_gp(ctxt, 0);
+ }
eax = reg_read(ctxt, VCPU_REGS_RAX);
ecx = reg_read(ctxt, VCPU_REGS_RCX);
@@ -3934,6 +3942,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
}
/*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+ return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+ bool cr4_osfxsr;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return __fxstate_size(16);
+
+ cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+ return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
+
+/*
* FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
* 1) 16 bit mode
* 2) 32 bit mode
@@ -3954,7 +3981,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
static int em_fxsave(struct x86_emulate_ctxt *ctxt)
{
struct fxregs_state fx_state;
- size_t size;
int rc;
rc = check_fxsr(ctxt);
@@ -3970,68 +3996,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)
- size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]);
- else
- size = offsetof(struct fxregs_state, xmm_space[0]);
-
- return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
-}
-
-static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
- struct fxregs_state *new)
-{
- int rc = X86EMUL_CONTINUE;
- struct fxregs_state old;
-
- rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- /*
- * 64 bit host will restore XMM 8-15, which is not correct on non-64
- * bit guests. Load the current values in order to preserve 64 bit
- * XMMs after fxrstor.
- */
-#ifdef CONFIG_X86_64
- /* XXX: accessing XMM 8-15 very awkwardly */
- memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
-#endif
-
- /*
- * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
- * does save and restore MXCSR.
- */
- if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
- memcpy(new->xmm_space, old.xmm_space, 8 * 16);
-
- return rc;
+ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+ fxstate_size(ctxt));
}
static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
{
struct fxregs_state fx_state;
int rc;
+ size_t size;
rc = check_fxsr(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ ctxt->ops->get_fpu(ctxt);
- if (fx_state.mxcsr >> 16)
- return emulate_gp(ctxt, 0);
+ size = fxstate_size(ctxt);
+ if (size < __fxstate_size(16)) {
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
+ }
- ctxt->ops->get_fpu(ctxt);
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
- if (ctxt->mode < X86EMUL_MODE_PROT64)
- rc = fxrstor_fixup(ctxt, &fx_state);
+ if (fx_state.mxcsr >> 16) {
+ rc = emulate_gp(ctxt, 0);
+ goto out;
+ }
if (rc == X86EMUL_CONTINUE)
rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+out:
ctxt->ops->put_fpu(ctxt);
return rc;
@@ -4166,7 +4166,7 @@ static int check_dr_write(struct x86_emulate_ctxt *ctxt)
static int check_svme(struct x86_emulate_ctxt *ctxt)
{
- u64 efer;
+ u64 efer = 0;
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
@@ -5316,6 +5316,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
const struct x86_emulate_ops *ops = ctxt->ops;
int rc = X86EMUL_CONTINUE;
int saved_dst_type = ctxt->dst.type;
+ unsigned emul_flags;
ctxt->mem_read.pos = 0;
@@ -5330,6 +5331,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
+ emul_flags = ctxt->ops->get_hflags(ctxt);
if (unlikely(ctxt->d &
(No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
@@ -5363,7 +5365,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
fetch_possible_mmx_operand(ctxt, &ctxt->dst);
}
- if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5392,7 +5394,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5446,7 +5448,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 73ea24d4f119..bdcd4139eca9 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -49,7 +49,7 @@ static void pic_unlock(struct kvm_pic *s)
__releases(&s->lock)
{
bool wakeup = s->wakeup_needed;
- struct kvm_vcpu *vcpu, *found = NULL;
+ struct kvm_vcpu *vcpu;
int i;
s->wakeup_needed = false;
@@ -59,16 +59,11 @@ static void pic_unlock(struct kvm_pic *s)
if (wakeup) {
kvm_for_each_vcpu(i, vcpu, s->kvm) {
if (kvm_apic_accept_pic_intr(vcpu)) {
- found = vcpu;
- break;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
}
}
-
- if (!found)
- return;
-
- kvm_make_request(KVM_REQ_EVENT, found);
- kvm_vcpu_kick(found);
}
}
@@ -239,7 +234,7 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
int kvm_pic_read_irq(struct kvm *kvm)
{
int irq, irq2, intno;
- struct kvm_pic *s = pic_irqchip(kvm);
+ struct kvm_pic *s = kvm->arch.vpic;
s->output = 0;
@@ -273,7 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
return intno;
}
-void kvm_pic_reset(struct kvm_kpic_state *s)
+static void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq, i;
struct kvm_vcpu *vcpu;
@@ -422,19 +417,16 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
return ret;
}
-static u32 pic_ioport_read(void *opaque, u32 addr1)
+static u32 pic_ioport_read(void *opaque, u32 addr)
{
struct kvm_kpic_state *s = opaque;
- unsigned int addr;
int ret;
- addr = addr1;
- addr &= 1;
if (s->poll) {
- ret = pic_poll_read(s, addr1);
+ ret = pic_poll_read(s, addr);
s->poll = 0;
} else
- if (addr == 0)
+ if ((addr & 1) == 0)
if (s->read_reg_select)
ret = s->isr;
else
@@ -456,76 +448,64 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
return s->elcr;
}
-static int picdev_in_range(gpa_t addr)
-{
- switch (addr) {
- case 0x20:
- case 0x21:
- case 0xa0:
- case 0xa1:
- case 0x4d0:
- case 0x4d1:
- return 1;
- default:
- return 0;
- }
-}
-
static int picdev_write(struct kvm_pic *s,
gpa_t addr, int len, const void *val)
{
unsigned char data = *(unsigned char *)val;
- if (!picdev_in_range(addr))
- return -EOPNOTSUPP;
if (len != 1) {
pr_pic_unimpl("non byte write\n");
return 0;
}
- pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
case 0xa0:
case 0xa1:
+ pic_lock(s);
pic_ioport_write(&s->pics[addr >> 7], addr, data);
+ pic_unlock(s);
break;
case 0x4d0:
case 0x4d1:
+ pic_lock(s);
elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ pic_unlock(s);
break;
+ default:
+ return -EOPNOTSUPP;
}
- pic_unlock(s);
return 0;
}
static int picdev_read(struct kvm_pic *s,
gpa_t addr, int len, void *val)
{
- unsigned char data = 0;
- if (!picdev_in_range(addr))
- return -EOPNOTSUPP;
+ unsigned char *data = (unsigned char *)val;
if (len != 1) {
memset(val, 0, len);
pr_pic_unimpl("non byte read\n");
return 0;
}
- pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
case 0xa0:
case 0xa1:
- data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ pic_lock(s);
+ *data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ pic_unlock(s);
break;
case 0x4d0:
case 0x4d1:
- data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ pic_lock(s);
+ *data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ pic_unlock(s);
break;
+ default:
+ return -EOPNOTSUPP;
}
- *(unsigned char *)val = data;
- pic_unlock(s);
return 0;
}
@@ -576,7 +556,7 @@ static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
*/
static void pic_irq_request(struct kvm *kvm, int level)
{
- struct kvm_pic *s = pic_irqchip(kvm);
+ struct kvm_pic *s = kvm->arch.vpic;
if (!s->output)
s->wakeup_needed = true;
@@ -657,9 +637,14 @@ void kvm_pic_destroy(struct kvm *kvm)
{
struct kvm_pic *vpic = kvm->arch.vpic;
+ if (!vpic)
+ return;
+
+ mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+ mutex_unlock(&kvm->slots_lock);
kvm->arch.vpic = NULL;
kfree(vpic);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 6e219e5c07d2..bdff437acbcb 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -266,11 +266,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
spin_unlock(&ioapic->lock);
}
-void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
+void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-
- if (!ioapic)
+ if (!ioapic_in_kernel(kvm))
return;
kvm_make_scan_ioapic_request(kvm);
}
@@ -315,7 +313,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
&& ioapic->irr & (1 << index))
ioapic_service(ioapic, index, false);
- kvm_vcpu_request_scan_ioapic(ioapic->kvm);
+ kvm_make_scan_ioapic_request(ioapic->kvm);
break;
}
}
@@ -624,10 +622,8 @@ int kvm_ioapic_init(struct kvm *kvm)
if (ret < 0) {
kvm->arch.vioapic = NULL;
kfree(ioapic);
- return ret;
}
- kvm_vcpu_request_scan_ioapic(kvm);
return ret;
}
@@ -635,37 +631,36 @@ void kvm_ioapic_destroy(struct kvm *kvm)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ if (!ioapic)
+ return;
+
cancel_delayed_work_sync(&ioapic->eoi_inject);
+ mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
kvm->arch.vioapic = NULL;
kfree(ioapic);
}
-int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
{
- struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
- if (!ioapic)
- return -EINVAL;
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
spin_lock(&ioapic->lock);
memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
state->irr &= ~ioapic->irr_delivered;
spin_unlock(&ioapic->lock);
- return 0;
}
-int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
{
- struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
- if (!ioapic)
- return -EINVAL;
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
spin_lock(&ioapic->lock);
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
ioapic->irr = 0;
ioapic->irr_delivered = 0;
- kvm_vcpu_request_scan_ioapic(kvm);
+ kvm_make_scan_ioapic_request(kvm);
kvm_ioapic_inject_all(ioapic, state->irr);
spin_unlock(&ioapic->lock);
- return 0;
}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 1cc6e54436db..29ce19732ccf 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -105,17 +105,13 @@ do { \
#define ASSERT(x) do { } while (0)
#endif
-static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
-{
- return kvm->arch.vioapic;
-}
-
static inline int ioapic_in_kernel(struct kvm *kvm)
{
- int ret;
+ int mode = kvm->arch.irqchip_mode;
- ret = (ioapic_irqchip(kvm) != NULL);
- return ret;
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -132,8 +128,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
-int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
-int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors);
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
deleted file mode 100644
index b181426f67b4..000000000000
--- a/arch/x86/kvm/iommu.c
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Author: Allen M. Kay <allen.m.kay@intel.com>
- * Author: Weidong Han <weidong.han@intel.com>
- * Author: Ben-Ami Yassour <benami@il.ibm.com>
- */
-
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/stat.h>
-#include <linux/iommu.h>
-#include "assigned-dev.h"
-
-static bool allow_unsafe_assigned_interrupts;
-module_param_named(allow_unsafe_assigned_interrupts,
- allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
- "Enable device assignment on platforms without interrupt remapping support.");
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm);
-static void kvm_iommu_put_pages(struct kvm *kvm,
- gfn_t base_gfn, unsigned long npages);
-
-static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
- unsigned long npages)
-{
- gfn_t end_gfn;
- kvm_pfn_t pfn;
-
- pfn = gfn_to_pfn_memslot(slot, gfn);
- end_gfn = gfn + npages;
- gfn += 1;
-
- if (is_error_noslot_pfn(pfn))
- return pfn;
-
- while (gfn < end_gfn)
- gfn_to_pfn_memslot(slot, gfn++);
-
- return pfn;
-}
-
-static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn,
- unsigned long npages)
-{
- unsigned long i;
-
- for (i = 0; i < npages; ++i)
- kvm_release_pfn_clean(pfn + i);
-}
-
-int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
- gfn_t gfn, end_gfn;
- kvm_pfn_t pfn;
- int r = 0;
- struct iommu_domain *domain = kvm->arch.iommu_domain;
- int flags;
-
- /* check if iommu exists and in use */
- if (!domain)
- return 0;
-
- gfn = slot->base_gfn;
- end_gfn = gfn + slot->npages;
-
- flags = IOMMU_READ;
- if (!(slot->flags & KVM_MEM_READONLY))
- flags |= IOMMU_WRITE;
- if (!kvm->arch.iommu_noncoherent)
- flags |= IOMMU_CACHE;
-
-
- while (gfn < end_gfn) {
- unsigned long page_size;
-
- /* Check if already mapped */
- if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
- gfn += 1;
- continue;
- }
-
- /* Get the page size we could use to map */
- page_size = kvm_host_page_size(kvm, gfn);
-
- /* Make sure the page_size does not exceed the memslot */
- while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
- page_size >>= 1;
-
- /* Make sure gfn is aligned to the page size we want to map */
- while ((gfn << PAGE_SHIFT) & (page_size - 1))
- page_size >>= 1;
-
- /* Make sure hva is aligned to the page size we want to map */
- while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
- page_size >>= 1;
-
- /*
- * Pin all pages we are about to map in memory. This is
- * important because we unmap and unpin in 4kb steps later.
- */
- pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT);
- if (is_error_noslot_pfn(pfn)) {
- gfn += 1;
- continue;
- }
-
- /* Map into IO address space */
- r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
- page_size, flags);
- if (r) {
- printk(KERN_ERR "kvm_iommu_map_address:"
- "iommu failed to map pfn=%llx\n", pfn);
- kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT);
- goto unmap_pages;
- }
-
- gfn += page_size >> PAGE_SHIFT;
-
- cond_resched();
- }
-
- return 0;
-
-unmap_pages:
- kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
- return r;
-}
-
-static int kvm_iommu_map_memslots(struct kvm *kvm)
-{
- int idx, r = 0;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
-
- if (kvm->arch.iommu_noncoherent)
- kvm_arch_register_noncoherent_dma(kvm);
-
- idx = srcu_read_lock(&kvm->srcu);
- slots = kvm_memslots(kvm);
-
- kvm_for_each_memslot(memslot, slots) {
- r = kvm_iommu_map_pages(kvm, memslot);
- if (r)
- break;
- }
- srcu_read_unlock(&kvm->srcu, idx);
-
- return r;
-}
-
-int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
-{
- struct iommu_domain *domain = kvm->arch.iommu_domain;
- int r;
- bool noncoherent;
-
- /* check if iommu exists and in use */
- if (!domain)
- return 0;
-
- if (pdev == NULL)
- return -ENODEV;
-
- r = iommu_attach_device(domain, &pdev->dev);
- if (r) {
- dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
- return r;
- }
-
- noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
-
- /* Check if need to update IOMMU page table for guest memory */
- if (noncoherent != kvm->arch.iommu_noncoherent) {
- kvm_iommu_unmap_memslots(kvm);
- kvm->arch.iommu_noncoherent = noncoherent;
- r = kvm_iommu_map_memslots(kvm);
- if (r)
- goto out_unmap;
- }
-
- kvm_arch_start_assignment(kvm);
- pci_set_dev_assigned(pdev);
-
- dev_info(&pdev->dev, "kvm assign device\n");
-
- return 0;
-out_unmap:
- kvm_iommu_unmap_memslots(kvm);
- return r;
-}
-
-int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
-{
- struct iommu_domain *domain = kvm->arch.iommu_domain;
-
- /* check if iommu exists and in use */
- if (!domain)
- return 0;
-
- if (pdev == NULL)
- return -ENODEV;
-
- iommu_detach_device(domain, &pdev->dev);
-
- pci_clear_dev_assigned(pdev);
- kvm_arch_end_assignment(kvm);
-
- dev_info(&pdev->dev, "kvm deassign device\n");
-
- return 0;
-}
-
-int kvm_iommu_map_guest(struct kvm *kvm)
-{
- int r;
-
- if (!iommu_present(&pci_bus_type)) {
- printk(KERN_ERR "%s: iommu not found\n", __func__);
- return -ENODEV;
- }
-
- mutex_lock(&kvm->slots_lock);
-
- kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
- if (!kvm->arch.iommu_domain) {
- r = -ENOMEM;
- goto out_unlock;
- }
-
- if (!allow_unsafe_assigned_interrupts &&
- !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) {
- printk(KERN_WARNING "%s: No interrupt remapping support,"
- " disallowing device assignment."
- " Re-enable with \"allow_unsafe_assigned_interrupts=1\""
- " module option.\n", __func__);
- iommu_domain_free(kvm->arch.iommu_domain);
- kvm->arch.iommu_domain = NULL;
- r = -EPERM;
- goto out_unlock;
- }
-
- r = kvm_iommu_map_memslots(kvm);
- if (r)
- kvm_iommu_unmap_memslots(kvm);
-
-out_unlock:
- mutex_unlock(&kvm->slots_lock);
- return r;
-}
-
-static void kvm_iommu_put_pages(struct kvm *kvm,
- gfn_t base_gfn, unsigned long npages)
-{
- struct iommu_domain *domain;
- gfn_t end_gfn, gfn;
- kvm_pfn_t pfn;
- u64 phys;
-
- domain = kvm->arch.iommu_domain;
- end_gfn = base_gfn + npages;
- gfn = base_gfn;
-
- /* check if iommu exists and in use */
- if (!domain)
- return;
-
- while (gfn < end_gfn) {
- unsigned long unmap_pages;
- size_t size;
-
- /* Get physical address */
- phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
-
- if (!phys) {
- gfn++;
- continue;
- }
-
- pfn = phys >> PAGE_SHIFT;
-
- /* Unmap address from IO address space */
- size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
- unmap_pages = 1ULL << get_order(size);
-
- /* Unpin all pages we just unmapped to not leak any memory */
- kvm_unpin_pages(kvm, pfn, unmap_pages);
-
- gfn += unmap_pages;
-
- cond_resched();
- }
-}
-
-void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
- kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
-}
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm)
-{
- int idx;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
-
- idx = srcu_read_lock(&kvm->srcu);
- slots = kvm_memslots(kvm);
-
- kvm_for_each_memslot(memslot, slots)
- kvm_iommu_unmap_pages(kvm, memslot);
-
- srcu_read_unlock(&kvm->srcu, idx);
-
- if (kvm->arch.iommu_noncoherent)
- kvm_arch_unregister_noncoherent_dma(kvm);
-
- return 0;
-}
-
-int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
- struct iommu_domain *domain = kvm->arch.iommu_domain;
-
- /* check if iommu exists and in use */
- if (!domain)
- return 0;
-
- mutex_lock(&kvm->slots_lock);
- kvm_iommu_unmap_memslots(kvm);
- kvm->arch.iommu_domain = NULL;
- kvm->arch.iommu_noncoherent = false;
- mutex_unlock(&kvm->slots_lock);
-
- iommu_domain_free(domain);
- return 0;
-}
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 60d91c9d160c..5c24811e8b0b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -60,7 +60,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (irqchip_split(v->kvm))
return pending_userspace_extint(v);
else
- return pic_irqchip(v->kvm)->output;
+ return v->kvm->arch.vpic->output;
} else
return 0;
}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 40d5b2cf6061..d5005cc26521 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -78,40 +78,42 @@ void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
-{
- return kvm->arch.vpic;
-}
-
static inline int pic_in_kernel(struct kvm *kvm)
{
- int ret;
+ int mode = kvm->arch.irqchip_mode;
- ret = (pic_irqchip(kvm) != NULL);
- return ret;
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
}
static inline int irqchip_split(struct kvm *kvm)
{
- return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT;
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
}
static inline int irqchip_kernel(struct kvm *kvm)
{
- return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL;
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
}
static inline int irqchip_in_kernel(struct kvm *kvm)
{
- bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE;
+ int mode = kvm->arch.irqchip_mode;
- /* Matches with wmb after initializing kvm->irq_routing. */
+ /* Matches smp_wmb() when setting irqchip_mode */
smp_rmb();
- return ret;
+ return mode != KVM_IRQCHIP_NONE;
}
-void kvm_pic_reset(struct kvm_kpic_state *s);
-
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6825cd36d13b..3cc3b2d130a0 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -42,7 +42,7 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
{
- struct kvm_pic *pic = pic_irqchip(kvm);
+ struct kvm_pic *pic = kvm->arch.vpic;
return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
}
@@ -232,11 +232,11 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
- if (!ioapic_in_kernel(kvm))
+ if (!irqchip_kernel(kvm))
goto unlock;
kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
- kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id);
+ kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
unlock:
mutex_unlock(&kvm->irq_lock);
}
@@ -274,42 +274,42 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
srcu_read_unlock(&kvm->irq_srcu, idx);
}
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
- int r = -EINVAL;
- int delta;
- unsigned max_pin;
-
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently inititalizing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
switch (ue->type) {
case KVM_IRQ_ROUTING_IRQCHIP:
- delta = 0;
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_SLAVE:
- delta = 8;
+ e->irqchip.pin += PIC_NUM_PINS / 2;
/* fall through */
case KVM_IRQCHIP_PIC_MASTER:
- if (!pic_in_kernel(kvm))
- goto out;
-
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
e->set = kvm_set_pic_irq;
- max_pin = PIC_NUM_PINS;
break;
case KVM_IRQCHIP_IOAPIC:
- if (!ioapic_in_kernel(kvm))
- goto out;
-
- max_pin = KVM_IOAPIC_NUM_PINS;
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
e->set = kvm_set_ioapic_irq;
break;
default:
- goto out;
+ return -EINVAL;
}
e->irqchip.irqchip = ue->u.irqchip.irqchip;
- e->irqchip.pin = ue->u.irqchip.pin + delta;
- if (e->irqchip.pin >= max_pin)
- goto out;
break;
case KVM_IRQ_ROUTING_MSI:
e->set = kvm_set_msi;
@@ -318,7 +318,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
e->msi.data = ue->u.msi.data;
if (kvm_msi_route_invalid(kvm, e))
- goto out;
+ return -EINVAL;
break;
case KVM_IRQ_ROUTING_HV_SINT:
e->set = kvm_hv_set_sint;
@@ -326,12 +326,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
e->hv_sint.sint = ue->u.hv_sint.sint;
break;
default:
- goto out;
+ return -EINVAL;
}
- r = 0;
-out:
- return r;
+ return 0;
}
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index bad6a25067bc..2819d4c123eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -177,8 +177,8 @@ static void recalculate_apic_map(struct kvm *kvm)
if (kvm_apic_present(vcpu))
max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
- new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
- sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
+ new = kvzalloc(sizeof(struct kvm_apic_map) +
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL);
if (!new)
goto out;
@@ -529,14 +529,16 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
{
- return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val,
- sizeof(val));
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
}
static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
{
- return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val,
- sizeof(*val));
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
}
static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
@@ -1493,31 +1495,65 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
static void cancel_hv_timer(struct kvm_lapic *apic)
{
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ preempt_disable();
kvm_x86_ops->cancel_hv_timer(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
+ preempt_enable();
}
static bool start_hv_timer(struct kvm_lapic *apic)
{
- u64 tscdeadline = apic->lapic_timer.tscdeadline;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ int r;
- if ((atomic_read(&apic->lapic_timer.pending) &&
- !apic_lvtt_period(apic)) ||
- kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
- if (apic->lapic_timer.hv_timer_in_use)
- cancel_hv_timer(apic);
- } else {
- apic->lapic_timer.hv_timer_in_use = true;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ if (!kvm_x86_ops->set_hv_timer)
+ return false;
- /* In case the sw timer triggered in the window */
- if (atomic_read(&apic->lapic_timer.pending) &&
- !apic_lvtt_period(apic))
- cancel_hv_timer(apic);
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return false;
+
+ r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
+ if (r < 0)
+ return false;
+
+ ktimer->hv_timer_in_use = true;
+ hrtimer_cancel(&ktimer->timer);
+
+ /*
+ * Also recheck ktimer->pending, in case the sw timer triggered in
+ * the window. For periodic timer, leave the hv timer running for
+ * simplicity, and the deadline will be recomputed on the next vmexit.
+ */
+ if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) {
+ if (r)
+ apic_timer_expired(apic);
+ return false;
}
- trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
- apic->lapic_timer.hv_timer_in_use);
- return apic->lapic_timer.hv_timer_in_use;
+
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true);
+ return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ start_sw_period(apic);
+ else if (apic_lvtt_tscdeadline(apic))
+ start_sw_tscdeadline(apic);
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+ if (!start_hv_timer(apic))
+ start_sw_timer(apic);
}
void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
@@ -1531,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
advance_periodic_target_expiration(apic);
- if (!start_hv_timer(apic))
- start_sw_period(apic);
+ restart_apic_timer(apic);
}
}
EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- WARN_ON(apic->lapic_timer.hv_timer_in_use);
-
- start_hv_timer(apic);
+ restart_apic_timer(vcpu->arch.apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
@@ -1552,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
/* Possibly the TSC deadline timer is not enabled yet */
- if (!apic->lapic_timer.hv_timer_in_use)
- return;
-
- cancel_hv_timer(apic);
+ if (apic->lapic_timer.hv_timer_in_use)
+ start_sw_timer(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
- if (atomic_read(&apic->lapic_timer.pending))
- return;
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
- start_sw_period(apic);
- else if (apic_lvtt_tscdeadline(apic))
- start_sw_tscdeadline(apic);
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ restart_apic_timer(apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
static void start_apic_timer(struct kvm_lapic *apic)
{
atomic_set(&apic->lapic_timer.pending, 0);
- if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
- if (set_target_expiration(apic) &&
- !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
- start_sw_period(apic);
- } else if (apic_lvtt_tscdeadline(apic)) {
- if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
- start_sw_tscdeadline(apic);
- }
+ if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ && !set_target_expiration(apic))
+ return;
+
+ restart_apic_timer(apic);
}
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -1809,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
* LAPIC interface
*----------------------------------------------------------------------
*/
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (!lapic_in_kernel(vcpu))
- return 0;
-
- return apic->lapic_timer.tscdeadline;
-}
-
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1932,7 +1948,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
for (i = 0; i < KVM_APIC_LVT_NUM; i++)
kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
apic_update_lvtt(apic);
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+ if (kvm_vcpu_is_reset_bsp(vcpu) &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
kvm_lapic_set_reg(apic, APIC_LVT0,
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
@@ -2285,8 +2302,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
- sizeof(u32)))
+ if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32)))
return;
apic_set_tpr(vcpu->arch.apic, data & 0xff);
@@ -2338,14 +2355,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
max_isr = 0;
data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
- kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
- sizeof(u32));
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
}
int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
{
if (vapic_addr) {
- if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.apic->vapic_cache,
vapic_addr, sizeof(u32)))
return -EINVAL;
@@ -2439,7 +2456,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
vcpu->arch.pv_eoi.msr_val = data;
if (!pv_eoi_enabled(vcpu))
return 0;
- return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data,
+ return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
addr, sizeof(u8));
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index bcbe811f3b97..29caa2c3dff9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
@@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ac7810513d0e..aafd399cf8c6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -183,13 +183,13 @@ static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_present_mask;
/*
- * The mask/value to distinguish a PTE that has been marked not-present for
- * access tracking purposes.
- * The mask would be either 0 if access tracking is disabled, or
- * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
+ * Non-present SPTEs with shadow_acc_track_value set are in place for access
+ * tracking.
*/
static u64 __read_mostly shadow_acc_track_mask;
static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
@@ -207,16 +207,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF
static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
+ BUG_ON((mmio_mask & mmio_value) != mmio_value);
+ shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+ return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+ MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ return !(spte & shadow_acc_track_value);
+}
+
+static inline u64 spte_shadow_accessed_mask(u64 spte)
+{
+ MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
+}
+
+static inline u64 spte_shadow_dirty_mask(u64 spte)
+{
+ MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
+}
+
static inline bool is_access_track_spte(u64 spte)
{
- /* Always false if shadow_acc_track_mask is zero. */
- return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+ return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
/*
@@ -270,7 +294,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
u64 mask = generation_mmio_spte_mask(gen);
access &= ACC_WRITE_MASK | ACC_USER_MASK;
- mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
+ mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
@@ -278,7 +302,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
static bool is_mmio_spte(u64 spte)
{
- return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+ return (spte & shadow_mmio_mask) == shadow_mmio_value;
}
static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -315,12 +339,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
return likely(kvm_gen == spte_gen);
}
+/*
+ * Sets the shadow PTE masks used by the MMU.
+ *
+ * Assumptions:
+ * - Setting either @accessed_mask or @dirty_mask requires setting both
+ * - At least one of @accessed_mask or @acc_track_mask must be set
+ */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
u64 acc_track_mask)
{
- if (acc_track_mask != 0)
- acc_track_mask |= SPTE_SPECIAL_MASK;
+ BUG_ON(!dirty_mask != !accessed_mask);
+ BUG_ON(!accessed_mask && !acc_track_mask);
+ BUG_ON(acc_track_mask & shadow_acc_track_value);
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
@@ -329,7 +361,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
shadow_x_mask = x_mask;
shadow_present_mask = p_mask;
shadow_acc_track_mask = acc_track_mask;
- WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -549,7 +580,7 @@ static bool spte_has_volatile_bits(u64 spte)
is_access_track_spte(spte))
return true;
- if (shadow_accessed_mask) {
+ if (spte_ad_enabled(spte)) {
if ((spte & shadow_accessed_mask) == 0 ||
(is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
return true;
@@ -560,14 +591,17 @@ static bool spte_has_volatile_bits(u64 spte)
static bool is_accessed_spte(u64 spte)
{
- return shadow_accessed_mask ? spte & shadow_accessed_mask
- : !is_access_track_spte(spte);
+ u64 accessed_mask = spte_shadow_accessed_mask(spte);
+
+ return accessed_mask ? spte & accessed_mask
+ : !is_access_track_spte(spte);
}
static bool is_dirty_spte(u64 spte)
{
- return shadow_dirty_mask ? spte & shadow_dirty_mask
- : spte & PT_WRITABLE_MASK;
+ u64 dirty_mask = spte_shadow_dirty_mask(spte);
+
+ return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}
/* Rules for using mmu_spte_set:
@@ -707,10 +741,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
static u64 mark_spte_for_access_track(u64 spte)
{
- if (shadow_accessed_mask != 0)
+ if (spte_ad_enabled(spte))
return spte & ~shadow_accessed_mask;
- if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+ if (is_access_track_spte(spte))
return spte;
/*
@@ -729,7 +763,6 @@ static u64 mark_spte_for_access_track(u64 spte)
spte |= (spte & shadow_acc_track_saved_bits_mask) <<
shadow_acc_track_saved_bits_shift;
spte &= ~shadow_acc_track_mask;
- spte |= shadow_acc_track_value;
return spte;
}
@@ -741,6 +774,7 @@ static u64 restore_acc_track_spte(u64 spte)
u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
& shadow_acc_track_saved_bits_mask;
+ WARN_ON_ONCE(spte_ad_enabled(spte));
WARN_ON_ONCE(!is_access_track_spte(spte));
new_spte &= ~shadow_acc_track_mask;
@@ -759,7 +793,7 @@ static bool mmu_spte_age(u64 *sptep)
if (!is_accessed_spte(spte))
return false;
- if (shadow_accessed_mask) {
+ if (spte_ad_enabled(spte)) {
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
} else {
@@ -1390,6 +1424,22 @@ static bool spte_clear_dirty(u64 *sptep)
return mmu_spte_update(sptep, spte);
}
+static bool wrprot_ad_disabled_spte(u64 *sptep)
+{
+ bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
+ if (was_writable)
+ kvm_set_pfn_dirty(spte_to_pfn(*sptep));
+
+ return was_writable;
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ * - D bit on ad-enabled SPTEs, and
+ * - W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
@@ -1397,7 +1447,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_clear_dirty(sptep);
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_clear_dirty(sptep);
+ else
+ flush |= wrprot_ad_disabled_spte(sptep);
return flush;
}
@@ -1420,7 +1473,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_set_dirty(sptep);
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_set_dirty(sptep);
return flush;
}
@@ -1452,7 +1506,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
/**
- * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
+ * protect the page if the D-bit isn't supported.
* @kvm: kvm instance
* @slot: slot to clear D-bit
* @gfn_offset: start of the BITS_PER_LONG pages we care about
@@ -1498,6 +1553,21 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
+/**
+ * kvm_arch_write_log_dirty - emulate dirty page logging
+ * @vcpu: Guest mode vcpu
+ *
+ * Emulate arch specific page modification logging for the
+ * nested hypervisor
+ */
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->write_log_dirty)
+ return kvm_x86_ops->write_log_dirty(vcpu);
+
+ return 0;
+}
+
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn)
{
@@ -1751,18 +1821,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
u64 *sptep;
struct rmap_iterator iter;
- /*
- * If there's no access bit in the secondary pte set by the hardware and
- * fast access tracking is also not enabled, it's up to gup-fast/gup to
- * set the access bit in the primary pte or in the page structure.
- */
- if (!shadow_accessed_mask && !shadow_acc_track_mask)
- goto out;
-
for_each_rmap_spte(rmap_head, &iter, sptep)
if (is_accessed_spte(*sptep))
return 1;
-out:
return 0;
}
@@ -1783,18 +1844,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- /*
- * In case of absence of EPT Access and Dirty Bits supports,
- * emulate the accessed bit for EPT, by checking if this page has
- * an EPT mapping, and clearing it if it does. On the next access,
- * a new EPT mapping will be established.
- * This has some overhead, but not as much as the cost of swapping
- * out actively used pages or breaking up actively used hugepages.
- */
- if (!shadow_accessed_mask && !shadow_acc_track_mask)
- return kvm_handle_hva_range(kvm, start, end, 0,
- kvm_unmap_rmapp);
-
return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}
@@ -2383,7 +2432,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+ shadow_user_mask | shadow_x_mask;
+
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+ else
+ spte |= shadow_accessed_mask;
mmu_spte_set(sptep, spte);
@@ -2651,10 +2705,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
{
u64 spte = 0;
int ret = 0;
+ struct kvm_mmu_page *sp;
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
+ sp = page_header(__pa(sptep));
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+
/*
* For the EPT case, shadow_present_mask is 0 if hardware
* supports exec-only page table entries. In that case,
@@ -2663,7 +2722,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
*/
spte |= shadow_present_mask;
if (!speculative)
- spte |= shadow_accessed_mask;
+ spte |= spte_shadow_accessed_mask(spte);
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
@@ -2720,7 +2779,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (pte_access & ACC_WRITE_MASK) {
kvm_vcpu_mark_page_dirty(vcpu, gfn);
- spte |= shadow_dirty_mask;
+ spte |= spte_shadow_dirty_mask(spte);
}
if (speculative)
@@ -2862,16 +2921,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
struct kvm_mmu_page *sp;
+ sp = page_header(__pa(sptep));
+
/*
- * Since it's no accessed bit on EPT, it's no way to
- * distinguish between actually accessed translations
- * and prefetched, so disable pte prefetch if EPT is
- * enabled.
+ * Without accessed bits, there's no way to distinguish between
+ * actually accessed translations and prefetched, so disable pte
+ * prefetch if accessed bits aren't available.
*/
- if (!shadow_accessed_mask)
+ if (sp_ad_disabled(sp))
return;
- sp = page_header(__pa(sptep));
if (sp->role.level > PT_PAGE_TABLE_LEVEL)
return;
@@ -3683,12 +3742,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
-static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
{
if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
+ if (is_guest_mode(vcpu))
+ return false;
+
return kvm_x86_ops->interrupt_allowed(vcpu);
}
@@ -3704,7 +3766,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (!async)
return false; /* *pfn has correct page already */
- if (!prefault && can_do_async_pf(vcpu)) {
+ if (!prefault && kvm_can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(gva, gfn);
@@ -4272,6 +4334,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->base_role.word = 0;
context->base_role.smm = is_smm(vcpu);
+ context->base_role.ad_disabled = (shadow_accessed_mask == 0);
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
@@ -4340,7 +4403,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ bool accessed_dirty)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
@@ -4349,6 +4413,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->nx = true;
+ context->ept_ad = accessed_dirty;
context->page_fault = ept_page_fault;
context->gva_to_gpa = ept_gva_to_gpa;
context->sync_page = ept_sync_page;
@@ -4357,6 +4422,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
context->root_level = context->shadow_root_level;
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
+ context->base_role.ad_disabled = !accessed_dirty;
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
@@ -4616,6 +4682,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mask.smep_andnot_wp = 1;
mask.smap_andnot_wp = 1;
mask.smm = 1;
+ mask.ad_disabled = 1;
/*
* If we don't have indirect shadow pages, it means no page is
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index ddc56e91f2e4..a276834950c1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL << (e - s + 1)) - 1) << s;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
@@ -74,7 +74,9 @@ enum {
int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ bool accessed_dirty);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
@@ -201,4 +203,5 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 5a24b846a1cb..8b97a6cba8d1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -30,8 +30,9 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \
- " %snxe root %u %s%c", __entry->mmu_valid_gen, \
+ trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \
+ " %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
__entry->gfn, role.level, \
role.cr4_pae ? " pae" : "", \
role.quadrant, \
@@ -39,6 +40,7 @@
access_str[role.access], \
role.invalid ? " invalid" : "", \
role.nxe ? "" : "!", \
+ role.ad_disabled ? "!" : "", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
saved_ptr; \
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 37942e419c32..ea67dc876316 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -40,8 +40,8 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
int i;
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
- slot->arch.gfn_track[i] = kvm_kvzalloc(npages *
- sizeof(*slot->arch.gfn_track[i]));
+ slot->arch.gfn_track[i] = kvzalloc(npages *
+ sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL);
if (!slot->arch.gfn_track[i])
goto track_free;
}
@@ -160,6 +160,14 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
}
+void kvm_page_track_cleanup(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ cleanup_srcu_struct(&head->track_srcu);
+}
+
void kvm_page_track_init(struct kvm *kvm)
{
struct kvm_page_track_notifier_head *head;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a01105485315..b0454c7e4cff 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,13 +23,6 @@
* so the code in this file is compiled twice, once per pte size.
*/
-/*
- * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
- * uses for EPT without A/D paging type.
- */
-extern u64 __pure __using_nonexistent_pte_bit(void)
- __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
-
#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
@@ -39,10 +32,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
- #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
- #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4
#define CMPXCHG cmpxchg
@@ -60,10 +52,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
- #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
- #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
#define CMPXCHG cmpxchg
#elif PTTYPE == PTTYPE_EPT
#define pt_element_t u64
@@ -74,16 +65,18 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
- #define PT_GUEST_ACCESSED_MASK 0
- #define PT_GUEST_DIRTY_MASK 0
- #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
- #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+ #define PT_GUEST_DIRTY_SHIFT 9
+ #define PT_GUEST_ACCESSED_SHIFT 8
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
#define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 4
#else
#error Invalid PTTYPE value
#endif
+#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
+#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
+
#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
@@ -111,12 +104,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}
-static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
+ unsigned gpte)
{
unsigned mask;
/* dirty bit is not supported, so no need to track it */
- if (!PT_GUEST_DIRTY_MASK)
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
return;
BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
@@ -171,7 +165,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
goto no_present;
/* if accessed bit is not supported prefetch non accessed gpte */
- if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
+ if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK))
goto no_present;
return false;
@@ -217,7 +211,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
int ret;
/* dirty/accessed bits are not supported, so no need to update them */
- if (!PT_GUEST_DIRTY_MASK)
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
return 0;
for (level = walker->max_level; level >= walker->level; --level) {
@@ -232,6 +226,10 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
if (level == walker->level && write_fault &&
!(pte & PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+#if PTTYPE == PTTYPE_EPT
+ if (kvm_arch_write_log_dirty(vcpu))
+ return -EINVAL;
+#endif
pte |= PT_GUEST_DIRTY_MASK;
}
if (pte == orig_pte)
@@ -285,9 +283,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
- unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
+ u64 pt_access, pte_access;
+ unsigned index, accessed_dirty, pte_pkey;
+ unsigned nested_access;
gpa_t pte_gpa;
+ bool have_ad;
int offset;
+ u64 walk_nx_mask = 0;
const int write_fault = access & PFERR_WRITE_MASK;
const int user_fault = access & PFERR_USER_MASK;
const int fetch_fault = access & PFERR_FETCH_MASK;
@@ -299,8 +301,10 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
retry_walk:
walker->level = mmu->root_level;
pte = mmu->get_cr3(vcpu);
+ have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
#if PTTYPE == 64
+ walk_nx_mask = 1ULL << PT64_NX_SHIFT;
if (walker->level == PT32E_ROOT_LEVEL) {
pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
@@ -312,15 +316,21 @@ retry_walk:
walker->max_level = walker->level;
ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
- accessed_dirty = PT_GUEST_ACCESSED_MASK;
- pt_access = pte_access = ACC_ALL;
+ /*
+ * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
+ * by the MOV to CR instruction are treated as reads and do not cause the
+ * processor to set the dirty flag in any EPT paging-structure entry.
+ */
+ nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
+
+ pte_access = ~0;
++walker->level;
do {
gfn_t real_gfn;
unsigned long host_addr;
- pt_access &= pte_access;
+ pt_access = pte_access;
--walker->level;
index = PT_INDEX(addr, walker->level);
@@ -332,7 +342,7 @@ retry_walk:
walker->pte_gpa[walker->level - 1] = pte_gpa;
real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
- PFERR_USER_MASK|PFERR_WRITE_MASK,
+ nested_access,
&walker->fault);
/*
@@ -362,6 +372,12 @@ retry_walk:
trace_kvm_mmu_paging_element(pte, walker->level);
+ /*
+ * Inverting the NX it lets us AND it like other
+ * permission bits.
+ */
+ pte_access = pt_access & (pte ^ walk_nx_mask);
+
if (unlikely(!FNAME(is_present_gpte)(pte)))
goto error;
@@ -370,14 +386,16 @@ retry_walk:
goto error;
}
- accessed_dirty &= pte;
- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
- errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access);
+ accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask);
+ walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask);
+ errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
goto error;
@@ -394,7 +412,7 @@ retry_walk:
walker->gfn = real_gpa >> PAGE_SHIFT;
if (!write_fault)
- FNAME(protect_clean_gpte)(&pte_access, pte);
+ FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
else
/*
* On a write fault, fold the dirty bit into accessed_dirty.
@@ -412,10 +430,8 @@ retry_walk:
goto retry_walk;
}
- walker->pt_access = pt_access;
- walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pte_access, pt_access);
+ __func__, (u64)pte, walker->pte_access, walker->pt_access);
return 1;
error:
@@ -443,7 +459,7 @@ error:
*/
if (!(errcode & PFERR_RSVD_MASK)) {
vcpu->arch.exit_qualification &= 0x187;
- vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+ vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
}
#endif
walker->fault.address = addr;
@@ -485,7 +501,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- FNAME(protect_clean_gpte)(&pte_access, gpte);
+ FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
if (is_error_pfn(pfn))
@@ -979,7 +995,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
pte_access &= FNAME(gpte_access)(vcpu, gpte);
- FNAME(protect_clean_gpte)(&pte_access, gpte);
+ FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
&nr_present))
@@ -1025,3 +1041,4 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
#undef PT_GUEST_DIRTY_MASK
#undef PT_GUEST_DIRTY_SHIFT
#undef PT_GUEST_ACCESSED_SHIFT
+#undef PT_HAVE_ACCESSED_DIRTY
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index 9d4a8504a95a..5ab4a364348e 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -294,7 +294,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
((u64)1 << edx.split.bit_width_fixed) - 1;
}
- pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
+ pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d1efe2c62b3f..905ea6052517 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/amd-iommu.h>
#include <linux/hashtable.h>
+#include <linux/frame.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -189,6 +190,7 @@ struct vcpu_svm {
struct nested_state nested;
bool nmi_singlestep;
+ u64 nmi_singlestep_guest_rflags;
unsigned int3_injected;
unsigned long int3_rip;
@@ -741,7 +743,6 @@ static int svm_hardware_enable(void)
struct svm_cpu_data *sd;
uint64_t efer;
- struct desc_ptr gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();
@@ -763,8 +764,7 @@ static int svm_hardware_enable(void)
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;
- native_store_gdt(&gdt_descr);
- gdt = (struct desc_struct *)gdt_descr.address;
+ gdt = get_current_gdt_rw();
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
wrmsrl(MSR_EFER, efer | EFER_SVME);
@@ -965,6 +965,18 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
+static void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+ svm->nmi_singlestep = false;
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+ /* Clear our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+ }
+}
+
/* Note:
* This hash table is used to map VM_ID to a struct kvm_arch,
* when handling AMD IOMMU GALOG notification to schedule in
@@ -1198,10 +1210,13 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_CLGI);
set_intercept(svm, INTERCEPT_SKINIT);
set_intercept(svm, INTERCEPT_WBINVD);
- set_intercept(svm, INTERCEPT_MONITOR);
- set_intercept(svm, INTERCEPT_MWAIT);
set_intercept(svm, INTERCEPT_XSETBV);
+ if (!kvm_mwait_in_guest()) {
+ set_intercept(svm, INTERCEPT_MONITOR);
+ set_intercept(svm, INTERCEPT_MWAIT);
+ }
+
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __pa(svm->msrpm);
control->int_ctl = V_INTR_MASKING_MASK;
@@ -1271,7 +1286,8 @@ static void init_vmcb(struct vcpu_svm *svm)
}
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index)
+static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
+ unsigned int index)
{
u64 *avic_physical_id_table;
struct kvm_arch *vm_data = &vcpu->kvm->arch;
@@ -1379,6 +1395,9 @@ static void avic_vm_destroy(struct kvm *kvm)
unsigned long flags;
struct kvm_arch *vm_data = &kvm->arch;
+ if (!avic)
+ return;
+
avic_free_vm_id(vm_data->avic_vm_id);
if (vm_data->avic_logical_id_table_page)
@@ -1707,11 +1726,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
- return to_svm(vcpu)->vmcb->save.rflags;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long rflags = svm->vmcb->save.rflags;
+
+ if (svm->nmi_singlestep) {
+ /* Hide our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ rflags &= ~X86_EFLAGS_RF;
+ }
+ return rflags;
}
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
+ if (to_svm(vcpu)->nmi_singlestep)
+ rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
/*
* Any change of EFLAGS.VM is accompanied by a reload of SS
* (caused by either a task switch or an inter-privilege IRET),
@@ -1802,7 +1834,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
* AMD's VMCB does not have an explicit unusable field, so emulate it
* for cross vendor migration purposes by "not present"
*/
- var->unusable = !var->present || (var->type == 0);
+ var->unusable = !var->present;
switch (seg) {
case VCPU_SREG_TR:
@@ -1835,6 +1867,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
*/
if (var->unusable)
var->db = 0;
+ /* This is symmetric with svm_set_segment() */
var->dpl = to_svm(vcpu)->vmcb->save.cpl;
break;
}
@@ -1975,18 +2008,14 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
s->base = var->base;
s->limit = var->limit;
s->selector = var->selector;
- if (var->unusable)
- s->attrib = 0;
- else {
- s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
- s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
- s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
- s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
- s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
- s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
- s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
- s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
- }
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
/*
* This is always accurate, except if SYSRET returned to a segment
@@ -1995,7 +2024,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
* would entail passing the CPL to userspace and back.
*/
if (seg == VCPU_SREG_SS)
- svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ /* This is symmetric with svm_get_segment() */
+ svm->vmcb->save.cpl = (var->dpl & 3);
mark_dirty(svm->vmcb, VMCB_SEG);
}
@@ -2108,10 +2138,7 @@ static int db_interception(struct vcpu_svm *svm)
}
if (svm->nmi_singlestep) {
- svm->nmi_singlestep = false;
- if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
- svm->vmcb->save.rflags &=
- ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ disable_nmi_singlestep(svm);
}
if (svm->vcpu.guest_debug &
@@ -2366,8 +2393,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
static int nested_svm_check_permissions(struct vcpu_svm *svm)
{
- if (!(svm->vcpu.arch.efer & EFER_SVME)
- || !is_paging(&svm->vcpu)) {
+ if (!(svm->vcpu.arch.efer & EFER_SVME) ||
+ !is_paging(&svm->vcpu)) {
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
@@ -2377,7 +2404,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
return 1;
}
- return 0;
+ return 0;
}
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
@@ -2530,6 +2557,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
+/* DB exceptions for our internal use must not cause vmexit */
+static int nested_svm_intercept_db(struct vcpu_svm *svm)
+{
+ unsigned long dr6;
+
+ /* if we're not singlestepping, it's not ours */
+ if (!svm->nmi_singlestep)
+ return NESTED_EXIT_DONE;
+
+ /* if it's not a singlestep exception, it's not ours */
+ if (kvm_get_dr(&svm->vcpu, 6, &dr6))
+ return NESTED_EXIT_DONE;
+ if (!(dr6 & DR6_BS))
+ return NESTED_EXIT_DONE;
+
+ /* if the guest is singlestepping, it should get the vmexit */
+ if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
+ disable_nmi_singlestep(svm);
+ return NESTED_EXIT_DONE;
+ }
+
+ /* it's ours, the nested hypervisor must not see this one */
+ return NESTED_EXIT_HOST;
+}
+
static int nested_svm_exit_special(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
@@ -2585,8 +2637,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
}
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (svm->nested.intercept_exceptions & excp_bits)
- vmexit = NESTED_EXIT_DONE;
+ if (svm->nested.intercept_exceptions & excp_bits) {
+ if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
+ vmexit = nested_svm_intercept_db(svm);
+ else
+ vmexit = NESTED_EXIT_DONE;
+ }
/* async page fault always cause vmexit */
else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
svm->apf_reason != 0)
@@ -4623,10 +4679,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
== HF_NMI_MASK)
return; /* IRET will cause a vm exit */
+ if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0)
+ return; /* STGI will cause a vm exit */
+
+ if (svm->nested.exit_required)
+ return; /* we're not going to run the guest yet */
+
/*
* Something prevents NMI from been injected. Single step over possible
* problem (IRET or exception injection or interrupt shadow)
*/
+ svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
@@ -4767,6 +4830,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->nested.exit_required))
return;
+ /*
+ * Disable singlestep if we're injecting an interrupt/exception.
+ * We don't want our modified rflags to be pushed on the stack where
+ * we might not be able to easily reset them if we disabled NMI
+ * singlestep later.
+ */
+ if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+ /*
+ * Event injection happens before external interrupts cause a
+ * vmexit and interrupts are disabled here, so smp_send_reschedule
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+ smp_send_reschedule(vcpu->cpu);
+ }
+
pre_svm_run(svm);
sync_lapic_to_cr8(vcpu);
@@ -4903,6 +4982,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
mark_all_clean(svm->vmcb);
}
+STACK_FRAME_NON_STANDARD(svm_vcpu_run);
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
@@ -5253,6 +5333,12 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
+static void svm_setup_mce(struct kvm_vcpu *vcpu)
+{
+ /* [63:9] are reserved. */
+ vcpu->arch.mcg_cap &= 0x1ff;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -5364,6 +5450,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.pmu_ops = &amd_pmu_ops,
.deliver_posted_interrupt = svm_deliver_avic_intr,
.update_pi_irte = svm_update_pi_irte,
+ .setup_mce = svm_setup_mce,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 98e82ee1e699..f76efad248ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -33,6 +33,7 @@
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
+#include <linux/frame.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -48,6 +49,7 @@
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
+#include <asm/mmu_context.h>
#include "trace.h"
#include "pmu.h"
@@ -84,9 +86,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
-static bool __read_mostly vmm_exclusive = 1;
-module_param(vmm_exclusive, bool, S_IRUGO);
-
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
@@ -251,6 +250,7 @@ struct __packed vmcs12 {
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
+ u64 pml_address;
u64 guest_ia32_debugctl;
u64 guest_ia32_pat;
u64 guest_ia32_efer;
@@ -372,6 +372,7 @@ struct __packed vmcs12 {
u16 guest_ldtr_selector;
u16 guest_tr_selector;
u16 guest_intr_status;
+ u16 guest_pml_index;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -410,6 +411,7 @@ struct nested_vmx {
/* Has the level1 guest done vmxon? */
bool vmxon;
gpa_t vmxon_ptr;
+ bool pml_full;
/* The guest-physical address of the current VMCS L1 keeps for L2 */
gpa_t current_vmptr;
@@ -596,6 +598,7 @@ struct vcpu_vmx {
int gs_ldt_reload_needed;
int fs_reload_needed;
u64 msr_host_bndcfgs;
+ unsigned long vmcs_host_cr3; /* May not match real cr3 */
unsigned long vmcs_host_cr4; /* May not match real cr4 */
} host_state;
struct {
@@ -615,10 +618,6 @@ struct vcpu_vmx {
int vpid;
bool emulation_required;
- /* Support for vnmi-less CPUs */
- int soft_vnmi_blocked;
- ktime_t entry_time;
- s64 vnmi_blocked_time;
u32 exit_reason;
/* Posted interrupt descriptor */
@@ -749,6 +748,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
FIELD(GUEST_INTR_STATUS, guest_intr_status),
+ FIELD(GUEST_PML_INDEX, guest_pml_index),
FIELD(HOST_ES_SELECTOR, host_es_selector),
FIELD(HOST_CS_SELECTOR, host_cs_selector),
FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -774,6 +774,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+ FIELD64(PML_ADDRESS, pml_address),
FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
@@ -912,10 +913,9 @@ static void nested_release_page_clean(struct page *page)
kvm_release_page_clean(page);
}
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
-static u64 construct_eptp(unsigned long root_hpa);
-static void kvm_cpu_vmxon(u64 addr);
-static void kvm_cpu_vmxoff(void);
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
/*
* We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
@@ -1239,6 +1238,11 @@ static inline bool cpu_has_vmx_invvpid_global(void)
return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
}
+static inline bool cpu_has_vmx_invvpid(void)
+{
+ return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
+}
+
static inline bool cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -1285,11 +1289,6 @@ static inline bool cpu_has_vmx_invpcid(void)
SECONDARY_EXEC_ENABLE_INVPCID;
}
-static inline bool cpu_has_virtual_nmis(void)
-{
- return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
-}
-
static inline bool cpu_has_vmx_wbinvd_exit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -1324,6 +1323,11 @@ static inline bool report_flexpriority(void)
return flexpriority_enabled;
}
+static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
+{
+ return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
+}
+
static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
{
return vmcs12->cpu_based_vm_exec_control & bit;
@@ -1358,6 +1362,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
vmx_xsaves_supported();
}
+static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
+}
+
static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -2052,14 +2061,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
*/
static unsigned long segment_base(u16 selector)
{
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
struct desc_struct *table;
unsigned long v;
if (!(selector & ~SEGMENT_RPL_MASK))
return 0;
- table = (struct desc_struct *)gdt->address;
+ table = get_current_gdt_ro();
if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
u16 ldt_selector = kvm_read_ldt();
@@ -2164,7 +2172,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
#endif
if (vmx->host_state.msr_host_bndcfgs)
wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
- load_gdt(this_cpu_ptr(&host_gdt));
+ load_fixmap_gdt(raw_smp_processor_id());
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -2235,15 +2243,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
- if (!vmm_exclusive)
- kvm_cpu_vmxon(phys_addr);
- else if (!already_loaded)
- loaded_vmcs_clear(vmx->loaded_vmcs);
-
if (!already_loaded) {
+ loaded_vmcs_clear(vmx->loaded_vmcs);
local_irq_disable();
crash_disable_local_vmclear(cpu);
@@ -2266,7 +2269,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
if (!already_loaded) {
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+ void *gdt = get_current_gdt_ro();
unsigned long sysenter_esp;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -2277,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
vmcs_writel(HOST_TR_BASE,
(unsigned long)this_cpu_ptr(&cpu_tss));
- vmcs_writel(HOST_GDTR_BASE, gdt->address);
+ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*
* VM exits change the host TR limit to 0x67 after a VM
@@ -2321,11 +2324,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
vmx_vcpu_pi_put(vcpu);
__vmx_load_host_state(to_vmx(vcpu));
- if (!vmm_exclusive) {
- __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
- vcpu->cpu = -1;
- kvm_cpu_vmxoff();
- }
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
@@ -2431,7 +2429,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
if (!(vmcs12->exception_bitmap & (1u << nr)))
return 0;
- nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
return 1;
@@ -2749,11 +2747,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_high);
vmx->nested.nested_vmx_secondary_ctls_low = 0;
vmx->nested.nested_vmx_secondary_ctls_high &=
+ SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED |
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
- SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
@@ -2764,14 +2762,19 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_EPT;
vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
- VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
- VMX_EPT_INVEPT_BIT;
+ VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
if (cpu_has_vmx_ept_execute_only())
vmx->nested.nested_vmx_ept_caps |=
VMX_EPT_EXECUTE_ONLY_BIT;
vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
- VMX_EPT_EXTENT_CONTEXT_BIT;
+ VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
+ VMX_EPT_1GB_PAGE_BIT;
+ if (enable_ept_ad_bits) {
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_PML;
+ vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
+ }
} else
vmx->nested.nested_vmx_ept_caps = 0;
@@ -2781,10 +2784,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* though it is treated as global context. The alternative is
* not failing the single-context invvpid, and it is worse.
*/
- if (enable_vpid)
+ if (enable_vpid) {
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_VPID;
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
VMX_VPID_EXTENT_SUPPORTED_MASK;
- else
+ } else
vmx->nested.nested_vmx_vpid_caps = 0;
if (enable_unrestricted_guest)
@@ -3194,7 +3199,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -3276,7 +3282,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+ return 1;
+ if (is_noncanonical_address(data & PAGE_MASK) ||
+ (data & MSR_IA32_BNDCFGS_RSVD))
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
@@ -3416,6 +3426,7 @@ static __init int vmx_disabled_by_bios(void)
static void kvm_cpu_vmxon(u64 addr)
{
+ cr4_set_bits(X86_CR4_VMXE);
intel_pt_handle_vmx(1);
asm volatile (ASM_VMX_VMXON_RAX
@@ -3458,14 +3469,8 @@ static int hardware_enable(void)
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
- cr4_set_bits(X86_CR4_VMXE);
-
- if (vmm_exclusive) {
- kvm_cpu_vmxon(phys_addr);
- ept_sync_global();
- }
-
- native_store_gdt(this_cpu_ptr(&host_gdt));
+ kvm_cpu_vmxon(phys_addr);
+ ept_sync_global();
return 0;
}
@@ -3489,15 +3494,13 @@ static void kvm_cpu_vmxoff(void)
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
intel_pt_handle_vmx(0);
+ cr4_clear_bits(X86_CR4_VMXE);
}
static void hardware_disable(void)
{
- if (vmm_exclusive) {
- vmclear_local_loaded_vmcss();
- kvm_cpu_vmxoff();
- }
- cr4_clear_bits(X86_CR4_VMXE);
+ vmclear_local_loaded_vmcss();
+ kvm_cpu_vmxoff();
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -3547,11 +3550,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
- CPU_BASED_MWAIT_EXITING |
- CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING |
CPU_BASED_RDPMC_EXITING;
+ if (!kvm_mwait_in_guest())
+ min |= CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING;
+
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -3617,9 +3622,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_vmexit_control) < 0)
return -EIO;
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
- PIN_BASED_VMX_PREEMPTION_TIMER;
+ min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS;
+ opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
@@ -4011,11 +4016,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
{
- vpid_sync_context(vpid);
if (enable_ept) {
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+ ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
+ } else {
+ vpid_sync_context(vpid);
}
}
@@ -4024,6 +4030,12 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
}
+static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
+{
+ if (enable_ept)
+ vmx_flush_tlb(vcpu);
+}
+
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -4182,14 +4194,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = emulation_required(vcpu);
}
-static u64 construct_eptp(unsigned long root_hpa)
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
{
u64 eptp;
/* TODO write the value reading from MSR */
eptp = VMX_EPT_DEFAULT_MT |
VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
- if (enable_ept_ad_bits)
+ if (enable_ept_ad_bits &&
+ (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
eptp |= VMX_EPT_AD_ENABLE_BIT;
eptp |= (root_hpa & PAGE_MASK);
@@ -4203,7 +4216,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
guest_cr3 = cr3;
if (enable_ept) {
- eptp = construct_eptp(cr3);
+ eptp = construct_eptp(vcpu, cr3);
vmcs_write64(EPT_POINTER, eptp);
if (is_paging(vcpu) || is_guest_mode(vcpu))
guest_cr3 = kvm_read_cr3(vcpu);
@@ -5009,12 +5022,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
u32 low32, high32;
unsigned long tmpl;
struct desc_ptr dt;
- unsigned long cr0, cr4;
+ unsigned long cr0, cr3, cr4;
cr0 = read_cr0();
WARN_ON(cr0 & X86_CR0_TS);
vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
+
+ /*
+ * Save the most likely value for this task's CR3 in the VMCS.
+ * We can't use __get_current_cr3_fast() because we're not atomic.
+ */
+ cr3 = __read_cr3();
+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
+ vmx->host_state.vmcs_host_cr3 = cr3;
/* Save the most likely value for this task's CR4 in the VMCS. */
cr4 = cr4_read_shadow();
@@ -5157,7 +5177,8 @@ static void ept_set_mmio_spte_mask(void)
* EPT Misconfigurations can be generated if the value of bits 2:0
* of an EPT paging-structure entry is 110b (write/execute).
*/
- kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE);
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
+ VMX_EPT_MISCONFIG_WX_VALUE);
}
#define VMX_XSS_EXIT_BITMAP 0
@@ -5285,8 +5306,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx->rmode.vm86_active = 0;
- vmx->soft_vnmi_blocked = 0;
-
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(vcpu, 0);
@@ -5406,8 +5425,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
- if (!cpu_has_virtual_nmis() ||
- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
enable_irq_window(vcpu);
return;
}
@@ -5448,19 +5466,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (!is_guest_mode(vcpu)) {
- if (!cpu_has_virtual_nmis()) {
- /*
- * Tracking the NMI-blocked state in software is built upon
- * finding the next open IRQ window. This, in turn, depends on
- * well-behaving guests: They have to keep IRQs disabled at
- * least as long as the NMI handler runs. Otherwise we may
- * cause NMI nesting, maybe breaking the guest. But as this is
- * highly unlikely, we can live with the residual risk.
- */
- vmx->soft_vnmi_blocked = 1;
- vmx->vnmi_blocked_time = 0;
- }
-
++vcpu->stat.nmi_injections;
vmx->nmi_known_unmasked = false;
}
@@ -5477,8 +5482,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
- if (!cpu_has_virtual_nmis())
- return to_vmx(vcpu)->soft_vnmi_blocked;
if (to_vmx(vcpu)->nmi_known_unmasked)
return false;
return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
@@ -5488,20 +5491,13 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!cpu_has_virtual_nmis()) {
- if (vmx->soft_vnmi_blocked != masked) {
- vmx->soft_vnmi_blocked = masked;
- vmx->vnmi_blocked_time = 0;
- }
- } else {
- vmx->nmi_known_unmasked = !masked;
- if (masked)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- }
+ vmx->nmi_known_unmasked = !masked;
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -5509,9 +5505,6 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
if (to_vmx(vcpu)->nested.nested_run_pending)
return 0;
- if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
- return 0;
-
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
| GUEST_INTR_STATE_NMI));
@@ -6232,23 +6225,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
unsigned long exit_qualification;
gpa_t gpa;
u32 error_code;
- int gla_validity;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- gla_validity = (exit_qualification >> 7) & 0x3;
- if (gla_validity == 0x2) {
- printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
- printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
- (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
- vmcs_readl(GUEST_LINEAR_ADDRESS));
- printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
- (long unsigned int)exit_qualification);
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
- return 0;
- }
-
/*
* EPT violation happened while executing iret from NMI,
* "blocked by NMI" bit has to be set before next VM entry.
@@ -6256,7 +6235,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
* AAK134, BY25.
*/
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
- cpu_has_virtual_nmis() &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
@@ -6342,7 +6320,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (intr_window_requested && vmx_interrupt_allowed(vcpu))
return handle_interrupt_window(&vmx->vcpu);
- if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+ if (kvm_test_request(KVM_REQ_EVENT, vcpu))
return 1;
err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
@@ -6472,7 +6450,7 @@ void vmx_enable_tdp(void)
enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
0ull, VMX_EPT_EXECUTABLE_MASK,
cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
- enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK);
+ VMX_EPT_RWX_MASK);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
@@ -6517,8 +6495,10 @@ static __init int hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
- if (!cpu_has_vmx_vpid())
+ if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
+ !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
enable_vpid = 0;
+
if (!cpu_has_vmx_shadow_vmcs())
enable_shadow_vmcs = 0;
if (enable_shadow_vmcs)
@@ -6531,7 +6511,7 @@ static __init int hardware_setup(void)
enable_ept_ad_bits = 0;
}
- if (!cpu_has_vmx_ept_ad_bits())
+ if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
if (!cpu_has_vmx_unrestricted_guest())
@@ -6574,7 +6554,6 @@ static __init int hardware_setup(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -6941,97 +6920,21 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
return 0;
}
-/*
- * This function performs the various checks including
- * - if it's 4KB aligned
- * - No bits beyond the physical address width are set
- * - Returns 0 on success or else 1
- * (Intel SDM Section 30.3)
- */
-static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
- gpa_t *vmpointer)
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
{
gva_t gva;
- gpa_t vmptr;
struct x86_exception e;
- struct page *page;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
- sizeof(vmptr), &e)) {
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
+ sizeof(*vmpointer), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
- switch (exit_reason) {
- case EXIT_REASON_VMON:
- /*
- * SDM 3: 24.11.5
- * The first 4 bytes of VMXON region contain the supported
- * VMCS revision identifier
- *
- * Note - IA32_VMX_BASIC[48] will never be 1
- * for the nested case;
- * which replaces physical address width with 32
- *
- */
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
-
- page = nested_get_page(vcpu, vmptr);
- if (page == NULL) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
- if (*(u32 *)kmap(page) != VMCS12_REVISION) {
- kunmap(page);
- nested_release_page_clean(page);
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
- kunmap(page);
- nested_release_page_clean(page);
- vmx->nested.vmxon_ptr = vmptr;
- break;
- case EXIT_REASON_VMCLEAR:
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_INVALID_ADDRESS);
- return kvm_skip_emulated_instruction(vcpu);
- }
-
- if (vmptr == vmx->nested.vmxon_ptr) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_VMXON_POINTER);
- return kvm_skip_emulated_instruction(vcpu);
- }
- break;
- case EXIT_REASON_VMPTRLD:
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_INVALID_ADDRESS);
- return kvm_skip_emulated_instruction(vcpu);
- }
-
- if (vmptr == vmx->nested.vmxon_ptr) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_VMXON_POINTER);
- return kvm_skip_emulated_instruction(vcpu);
- }
- break;
- default:
- return 1; /* shouldn't happen */
- }
-
- if (vmpointer)
- *vmpointer = vmptr;
return 0;
}
@@ -7093,34 +6996,26 @@ out_msr_bitmap:
static int handle_vmon(struct kvm_vcpu *vcpu)
{
int ret;
- struct kvm_segment cs;
+ gpa_t vmptr;
+ struct page *page;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- /* The Intel VMX Instruction Reference lists a bunch of bits that
- * are prerequisite to running VMXON, most notably cr4.VMXE must be
- * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
- * Otherwise, we should fail with #UD. We test these now:
+ /*
+ * The Intel VMX Instruction Reference lists a bunch of bits that are
+ * prerequisite to running VMXON, most notably cr4.VMXE must be set to
+ * 1 (see vmx_set_cr4() for when we allow the guest to set this).
+ * Otherwise, we should fail with #UD. But most faulting conditions
+ * have already been checked by hardware, prior to the VM-exit for
+ * VMXON. We do test guest cr4.VMXE because processor CR4 always has
+ * that bit set to 1 in non-root mode.
*/
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
- !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
- (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
-
- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
- if (is_long_mode(vcpu) && !cs.l) {
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (vmx_get_cpl(vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
if (vmx->nested.vmxon) {
nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
return kvm_skip_emulated_instruction(vcpu);
@@ -7132,9 +7027,37 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
-
+
+ /*
+ * SDM 3: 24.11.5
+ * The first 4 bytes of VMXON region contain the supported
+ * VMCS revision identifier
+ *
+ * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
+ * which replaces physical address width with 32
+ */
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ page = nested_get_page(vcpu, vmptr);
+ if (page == NULL) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ if (*(u32 *)kmap(page) != VMCS12_REVISION) {
+ kunmap(page);
+ nested_release_page_clean(page);
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ kunmap(page);
+ nested_release_page_clean(page);
+
+ vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
if (ret)
return ret;
@@ -7147,29 +7070,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
* Intel's VMX Instruction Reference specifies a common set of prerequisites
* for running VMX instructions (except VMXON, whose prerequisites are
* slightly different). It also specifies what exception to inject otherwise.
+ * Note that many of these exceptions have priority over VM exits, so they
+ * don't have to be checked again here.
*/
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
- struct kvm_segment cs;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!vmx->nested.vmxon) {
+ if (!to_vmx(vcpu)->nested.vmxon) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 0;
}
-
- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
- if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
- (is_long_mode(vcpu) && !cs.l)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 0;
- }
-
- if (vmx_get_cpl(vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return 0;
- }
-
return 1;
}
@@ -7264,9 +7173,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vmx);
@@ -7513,7 +7432,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, true, &gva))
return 1;
- /* _system ok, as nested_vmx_check_permission verified cpl=0 */
+ /* _system ok, as hardware has verified cpl=0 */
kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
&field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
}
@@ -7596,9 +7515,19 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
if (vmx->nested.current_vmptr != vmptr) {
struct vmcs12 *new_vmcs12;
struct page *page;
@@ -7646,7 +7575,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, true, &vmcs_gva))
return 1;
- /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
+ /* ok to use *_system, as hardware has verified cpl=0 */
if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
(void *)&to_vmx(vcpu)->nested.current_vmptr,
sizeof(u64), &e)) {
@@ -7679,11 +7608,6 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
-
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
@@ -7733,7 +7657,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
unsigned long type, types;
gva_t gva;
struct x86_exception e;
- int vpid;
+ struct {
+ u64 vpid;
+ u64 gla;
+ } operand;
if (!(vmx->nested.nested_vmx_secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
@@ -7763,17 +7690,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
vmx_instruction_info, false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
- sizeof(u32), &e)) {
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+ sizeof(operand), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
+ if (operand.vpid >> 16) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
+ if (is_noncanonical_address(operand.gla)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ /* fall through */
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
- if (!vpid) {
+ if (!operand.vpid) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
return kvm_skip_emulated_instruction(vcpu);
@@ -7805,7 +7743,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
* "blocked by NMI" bit has to be set before next VM entry.
*/
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
- cpu_has_virtual_nmis() &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
@@ -7970,11 +7907,13 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
int cr = exit_qualification & 15;
- int reg = (exit_qualification >> 8) & 15;
- unsigned long val = kvm_register_readl(vcpu, reg);
+ int reg;
+ unsigned long val;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
+ reg = (exit_qualification >> 8) & 15;
+ val = kvm_register_readl(vcpu, reg);
switch (cr) {
case 0:
if (vmcs12->cr0_guest_host_mask &
@@ -8029,6 +7968,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
* lmsw can change bits 1..3 of cr0, and only set bit 0 of
* cr0. Other attempted changes are ignored, with no exit.
*/
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
if (vmcs12->cr0_guest_host_mask & 0xe &
(val ^ vmcs12->cr0_read_shadow))
return true;
@@ -8107,6 +8047,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
case EXIT_REASON_RDPMC:
return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+ case EXIT_REASON_RDRAND:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
+ case EXIT_REASON_RDSEED:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@ -8184,6 +8128,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
case EXIT_REASON_PREEMPTION_TIMER:
return false;
+ case EXIT_REASON_PML_FULL:
+ /* We emulate PML support to L1. */
+ return false;
default:
return true;
}
@@ -8477,31 +8424,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
return 0;
}
- if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
- !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
- get_vmcs12(vcpu))))) {
- if (vmx_interrupt_allowed(vcpu)) {
- vmx->soft_vnmi_blocked = 0;
- } else if (vmx->vnmi_blocked_time > 1000000000LL &&
- vcpu->arch.nmi_pending) {
- /*
- * This CPU don't support us in finding the end of an
- * NMI-blocked window if the guest runs with IRQs
- * disabled. So we pull the trigger after 1 s of
- * futile waiting, but inform the user about this.
- */
- printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
- "state on VCPU %d after 1 s timeout\n",
- __func__, vcpu->vcpu_id);
- vmx->soft_vnmi_blocked = 0;
- }
- }
-
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu);
else {
- WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+ exit_reason);
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -8547,6 +8475,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
} else {
sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmx_flush_tlb_ept_only(vcpu);
}
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
@@ -8572,8 +8501,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
*/
if (!is_guest_mode(vcpu) ||
!nested_cpu_has2(get_vmcs12(&vmx->vcpu),
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
vmcs_write64(APIC_ACCESS_ADDR, hpa);
+ vmx_flush_tlb_ept_only(vcpu);
+ }
}
static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
@@ -8741,6 +8672,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
);
}
}
+STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
static bool vmx_has_high_real_mode_segbase(void)
{
@@ -8768,37 +8700,33 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
- if (cpu_has_virtual_nmis()) {
- if (vmx->nmi_known_unmasked)
- return;
- /*
- * Can't use vmx->exit_intr_info since we're not sure what
- * the exit reason is.
- */
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
- vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
- /*
- * SDM 3: 27.7.1.2 (September 2008)
- * Re-set bit "block by NMI" before VM entry if vmexit caused by
- * a guest IRET fault.
- * SDM 3: 23.2.2 (September 2008)
- * Bit 12 is undefined in any of the following cases:
- * If the VM exit sets the valid bit in the IDT-vectoring
- * information field.
- * If the VM exit is due to a double fault.
- */
- if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
- vector != DF_VECTOR && !idtv_info_valid)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmx->nmi_known_unmasked =
- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
- & GUEST_INTR_STATE_NMI);
- } else if (unlikely(vmx->soft_vnmi_blocked))
- vmx->vnmi_blocked_time +=
- ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
+ if (vmx->nmi_known_unmasked)
+ return;
+ /*
+ * Can't use vmx->exit_intr_info since we're not sure what
+ * the exit reason is.
+ */
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
}
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
@@ -8913,11 +8841,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long debugctlmsr, cr4;
-
- /* Record the guest's net vcpu time for enforced NMI injections. */
- if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
- vmx->entry_time = ktime_get();
+ unsigned long debugctlmsr, cr3, cr4;
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
@@ -8939,6 +8863,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->host_state.vmcs_host_cr3 = cr3;
+ }
+
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -9125,17 +9055,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
+STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
-static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int cpu;
- if (vmx->loaded_vmcs == &vmx->vmcs01)
+ if (vmx->loaded_vmcs == vmcs)
return;
cpu = get_cpu();
- vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx->loaded_vmcs = vmcs;
vmx_vcpu_put(vcpu);
vmx_vcpu_load(vcpu, cpu);
vcpu->cpu = cpu;
@@ -9153,7 +9084,7 @@ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
r = vcpu_load(vcpu);
BUG_ON(r);
- vmx_load_vmcs01(vcpu);
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
free_nested(vmx);
vcpu_put(vcpu);
}
@@ -9214,11 +9145,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->loaded_vmcs->shadow_vmcs = NULL;
if (!vmx->loaded_vmcs->vmcs)
goto free_msrs;
- if (!vmm_exclusive)
- kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
loaded_vmcs_init(vmx->loaded_vmcs);
- if (!vmm_exclusive)
- kvm_cpu_vmxoff();
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -9460,16 +9387,28 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exit_reason;
+ unsigned long exit_qualification = vcpu->arch.exit_qualification;
- if (fault->error_code & PFERR_RSVD_MASK)
+ if (vmx->nested.pml_full) {
+ exit_reason = EXIT_REASON_PML_FULL;
+ vmx->nested.pml_full = false;
+ exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+ } else if (fault->error_code & PFERR_RSVD_MASK)
exit_reason = EXIT_REASON_EPT_MISCONFIG;
else
exit_reason = EXIT_REASON_EPT_VIOLATION;
- nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);
+
+ nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
vmcs12->guest_physical_address = fault->address;
}
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
+{
+ return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT;
+}
+
/* Callbacks for nested_ept_init_mmu_context: */
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
@@ -9478,17 +9417,26 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
return get_vmcs12(vcpu)->ept_pointer;
}
-static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
+ bool wants_ad;
+
WARN_ON(mmu_is_nested(vcpu));
+ wants_ad = nested_ept_ad_enabled(vcpu);
+ if (wants_ad && !enable_ept_ad_bits)
+ return 1;
+
+ kvm_mmu_unload(vcpu);
kvm_init_shadow_ept_mmu(vcpu,
to_vmx(vcpu)->nested.nested_vmx_ept_caps &
- VMX_EPT_EXECUTE_ONLY_BIT);
+ VMX_EPT_EXECUTE_ONLY_BIT,
+ wants_ad);
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+ return 0;
}
static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -9800,6 +9748,22 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u64 address = vmcs12->pml_address;
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !IS_ALIGNED(address, 4096) ||
+ address >> maxphyaddr)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
struct vmx_msr_entry *e)
{
@@ -9973,8 +9937,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
bool from_vmentry, u32 *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exec_control;
- bool nested_ept_enabled = false;
+ u32 exec_control, vmcs12_exec_ctrl;
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -10105,8 +10068,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT);
if (nested_cpu_has(vmcs12,
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
- exec_control |= vmcs12->secondary_vm_exec_control;
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
+ vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
+ ~SECONDARY_EXEC_ENABLE_PML;
+ exec_control |= vmcs12_exec_ctrl;
+ }
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
vmcs_write64(EOI_EXIT_BITMAP0,
@@ -10121,8 +10087,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_intr_status);
}
- nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
-
/*
* Write an illegal value to APIC_ACCESS_ADDR. Later,
* nested_get_vmcs12_pages will either fix it up or
@@ -10252,9 +10216,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
+ if (enable_pml) {
+ /*
+ * Conceptually we want to copy the PML address and index from
+ * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+ * since we always flush the log on each vmexit, this happens
+ * to be equivalent to simply resetting the fields in vmcs02.
+ */
+ ASSERT(vmx->pml_pg);
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
if (nested_cpu_has_ept(vmcs12)) {
- kvm_mmu_unload(vcpu);
- nested_ept_init_mmu_context(vcpu);
+ if (nested_ept_init_mmu_context(vcpu)) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+ } else if (nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ vmx_flush_tlb_ept_only(vcpu);
}
/*
@@ -10282,12 +10263,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx_set_efer(vcpu, vcpu->arch.efer);
/* Shadow page tables on either EPT or shadow page tables. */
- if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled,
+ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
entry_failure_code))
return 1;
- kvm_mmu_reset_context(vcpu);
-
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
@@ -10323,12 +10302,16 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_vmx_check_pml_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
vmx->nested.nested_vmx_procbased_ctls_low,
vmx->nested.nested_vmx_procbased_ctls_high) ||
- !vmx_control_verify(vmcs12->secondary_vm_exec_control,
- vmx->nested.nested_vmx_secondary_ctls_low,
- vmx->nested.nested_vmx_secondary_ctls_high) ||
+ (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ !vmx_control_verify(vmcs12->secondary_vm_exec_control,
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high)) ||
!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
vmx->nested.nested_vmx_pinbased_ctls_low,
vmx->nested.nested_vmx_pinbased_ctls_high) ||
@@ -10340,6 +10323,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx->nested.nested_vmx_entry_ctls_high))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
!nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
!nested_cr3_valid(vcpu, vmcs12->host_cr3))
@@ -10407,7 +10393,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct loaded_vmcs *vmcs02;
- int cpu;
u32 msr_entry_idx;
u32 exit_qual;
@@ -10420,18 +10405,12 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
- cpu = get_cpu();
- vmx->loaded_vmcs = vmcs02;
- vmx_vcpu_put(vcpu);
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- put_cpu();
-
+ vmx_switch_vmcs(vcpu, vmcs02);
vmx_segment_cache_clear(vmx);
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
leave_guest_mode(vcpu);
- vmx_load_vmcs01(vcpu);
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, exit_qual);
return 1;
@@ -10444,7 +10423,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
vmcs12->vm_entry_msr_load_count);
if (msr_entry_idx) {
leave_guest_mode(vcpu);
- vmx_load_vmcs01(vcpu);
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
return 1;
@@ -10764,8 +10743,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
}
- if (nested_cpu_has_ept(vmcs12))
- vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
if (nested_cpu_has_vid(vmcs12))
vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
@@ -10790,8 +10768,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
if (kvm_mpx_supported())
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
- if (nested_cpu_has_xsaves(vmcs12))
- vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
}
/*
@@ -11012,7 +10988,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
if (unlikely(vmx->fail))
vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
- vmx_load_vmcs01(vcpu);
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
&& nested_exit_intr_ack_set(vcpu)) {
@@ -11056,6 +11032,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
vmx_set_virtual_x2apic_mode(vcpu,
vcpu->arch.apic_base & X2APIC_ENABLE);
+ } else if (!nested_cpu_has_ept(vmcs12) &&
+ nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ vmx_flush_tlb_ept_only(vcpu);
}
/* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -11184,7 +11164,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
vmx->hv_deadline_tsc = tscl + delta_tsc;
vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
PIN_BASED_VMX_PREEMPTION_TIMER);
- return 0;
+
+ return delta_tsc == 0;
}
static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
@@ -11220,6 +11201,46 @@ static void vmx_flush_log_dirty(struct kvm *kvm)
kvm_flush_pml_buffers(kvm);
}
+static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t gpa;
+ struct page *page = NULL;
+ u64 *pml_address;
+
+ if (is_guest_mode(vcpu)) {
+ WARN_ON_ONCE(vmx->nested.pml_full);
+
+ /*
+ * Check if PML is enabled for the nested guest.
+ * Whether eptp bit 6 is set is already checked
+ * as part of A/D emulation.
+ */
+ vmcs12 = get_vmcs12(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ vmx->nested.pml_full = true;
+ return 1;
+ }
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+
+ page = nested_get_page(vcpu, vmcs12->pml_address);
+ if (!page)
+ return 0;
+
+ pml_address = kmap(page);
+ pml_address[vmcs12->guest_pml_index--] = gpa;
+ kunmap(page);
+ nested_release_page_clean(page);
+ }
+
+ return 0;
+}
+
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *memslot,
gfn_t offset, unsigned long mask)
@@ -11579,6 +11600,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .write_log_dirty = vmx_write_pml_buffer,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1faf620a6fdc..6c7266f7766d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,7 +27,6 @@
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
-#include "assigned-dev.h"
#include "pmu.h"
#include "hyperv.h"
@@ -1008,6 +1007,8 @@ static u32 emulated_msrs[] = {
MSR_IA32_MCG_CTL,
MSR_IA32_MCG_EXT_CTL,
MSR_IA32_SMBASE,
+ MSR_PLATFORM_INFO,
+ MSR_MISC_FEATURES_ENABLES,
};
static unsigned num_emulated_msrs;
@@ -1444,10 +1445,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
- s64 usdiff;
bool matched;
bool already_matched;
u64 data = msr->data;
+ bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
@@ -1455,51 +1456,34 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
- int faulted = 0;
-
- /* n.b - signed multiplication and division required */
- usdiff = data - kvm->arch.last_tsc_write;
-#ifdef CONFIG_X86_64
- usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
-#else
- /* do_div() only does unsigned */
- asm("1: idivl %[divisor]\n"
- "2: xor %%edx, %%edx\n"
- " movl $0, %[faulted]\n"
- "3:\n"
- ".section .fixup,\"ax\"\n"
- "4: movl $1, %[faulted]\n"
- " jmp 3b\n"
- ".previous\n"
-
- _ASM_EXTABLE(1b, 4b)
-
- : "=A"(usdiff), [faulted] "=r" (faulted)
- : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
-
-#endif
- do_div(elapsed, 1000);
- usdiff -= elapsed;
- if (usdiff < 0)
- usdiff = -usdiff;
-
- /* idivl overflow => difference is larger than USEC_PER_SEC */
- if (faulted)
- usdiff = USEC_PER_SEC;
- } else
- usdiff = USEC_PER_SEC; /* disable TSC match window below */
+ if (data == 0 && msr->host_initiated) {
+ /*
+ * detection of vcpu initialization -- need to sync
+ * with other vCPUs. This particularly helps to keep
+ * kvm_clock stable after CPU hotplug
+ */
+ synchronizing = true;
+ } else {
+ u64 tsc_exp = kvm->arch.last_tsc_write +
+ nsec_to_cycles(vcpu, elapsed);
+ u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
+ /*
+ * Special case: TSC write with a small delta (1 second)
+ * of virtual cycle time against real time is
+ * interpreted as an attempt to synchronize the CPU.
+ */
+ synchronizing = data < tsc_exp + tsc_hz &&
+ data + tsc_hz > tsc_exp;
+ }
+ }
/*
- * Special case: TSC write with a small delta (1 second) of virtual
- * cycle time against real time is interpreted as an attempt to
- * synchronize the CPU.
- *
* For a reliable TSC, we can match TSC offsets, and for an unstable
* TSC, we add elapsed time in this computation. We could let the
* compensation code attempt to catch up if we fall behind, but
* it's better to try to match offsets from the beginning.
*/
- if (usdiff < USEC_PER_SEC &&
+ if (synchronizing &&
vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
if (!check_tsc_unstable()) {
offset = kvm->arch.cur_tsc_offset;
@@ -1769,16 +1753,17 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
- clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+ kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
spin_unlock(&ka->pvclock_gtod_sync_lock);
#endif
}
-static u64 __get_kvmclock_ns(struct kvm *kvm)
+u64 get_kvmclock_ns(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
+ u64 ret;
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
@@ -1790,22 +1775,17 @@ static u64 __get_kvmclock_ns(struct kvm *kvm)
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
spin_unlock(&ka->pvclock_gtod_sync_lock);
+ /* both __this_cpu_read() and rdtsc() should be on the same cpu */
+ get_cpu();
+
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
- return __pvclock_read_cycles(&hv_clock, rdtsc());
-}
+ ret = __pvclock_read_cycles(&hv_clock, rdtsc());
-u64 get_kvmclock_ns(struct kvm *kvm)
-{
- unsigned long flags;
- s64 ns;
-
- local_irq_save(flags);
- ns = __get_kvmclock_ns(kvm);
- local_irq_restore(flags);
+ put_cpu();
- return ns;
+ return ret;
}
static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
@@ -1813,7 +1793,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info guest_hv_clock;
- if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time,
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
&guest_hv_clock, sizeof(guest_hv_clock))))
return;
@@ -1834,9 +1814,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
smp_wmb();
@@ -1850,16 +1830,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
- kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
smp_wmb();
vcpu->hv_clock.version++;
- kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -2092,7 +2072,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
- if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa,
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
sizeof(u32)))
return 1;
@@ -2111,7 +2091,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime,
+ if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
@@ -2122,7 +2102,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
vcpu->arch.st.steal.version += 1;
- kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
smp_wmb();
@@ -2131,14 +2111,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
smp_wmb();
vcpu->arch.st.steal.version += 1;
- kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
}
@@ -2155,6 +2135,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_VM_HSAVE_PA:
case MSR_AMD64_PATCH_LOADER:
case MSR_AMD64_BU_CFG2:
+ case MSR_AMD64_DC_CFG:
break;
case MSR_EFER:
@@ -2230,8 +2211,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
if (ka->boot_vcpu_runs_old_kvmclock != tmp)
- set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
- &vcpu->requests);
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
ka->boot_vcpu_runs_old_kvmclock = tmp;
}
@@ -2243,7 +2223,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & 1))
break;
- if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
vcpu->arch.pv_time_enabled = false;
@@ -2264,7 +2244,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
- if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime,
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
data & KVM_STEAL_VALID_BITS,
sizeof(struct kvm_steal_time)))
return 1;
@@ -2331,6 +2311,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.osvw.status = data;
break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated ||
+ data & ~MSR_PLATFORM_INFO_CPUID_FAULT ||
+ (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
+ cpuid_fault_enabled(vcpu)))
+ return 1;
+ vcpu->arch.msr_platform_info = data;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
+ (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ !supports_cpuid_fault(vcpu)))
+ return 1;
+ vcpu->arch.msr_misc_features_enables = data;
+ break;
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
@@ -2417,6 +2412,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_FAM10H_MMIO_CONF_BASE:
case MSR_AMD64_BU_CFG2:
case MSR_IA32_PERF_CTL:
+ case MSR_AMD64_DC_CFG:
msr_info->data = 0;
break;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
@@ -2545,6 +2541,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
msr_info->data = vcpu->arch.osvw.status;
break;
+ case MSR_PLATFORM_INFO:
+ msr_info->data = vcpu->arch.msr_platform_info;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ msr_info->data = vcpu->arch.msr_misc_features_enables;
+ break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -2675,15 +2677,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
- case KVM_CAP_ASSIGN_DEV_IRQ:
- case KVM_CAP_PCI_2_3:
-#endif
r = 1;
break;
case KVM_CAP_ADJUST_CLOCK:
r = KVM_CLOCK_TSC_STABLE;
break;
+ case KVM_CAP_X86_GUEST_MWAIT:
+ r = kvm_mwait_in_guest();
+ break;
case KVM_CAP_X86_SMM:
/* SMBASE is usually relocated above 1M on modern chipsets,
* and SMM handlers might indeed rely on 4G segment limits,
@@ -2695,9 +2696,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
*/
r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
break;
- case KVM_CAP_COALESCED_MMIO:
- r = KVM_COALESCED_MMIO_PAGE_OFFSET;
- break;
case KVM_CAP_VAPIC:
r = !kvm_x86_ops->cpu_has_accelerated_tpr();
break;
@@ -2713,11 +2711,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
break;
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
- case KVM_CAP_IOMMU:
- r = iommu_present(&pci_bus_type);
- break;
-#endif
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
break;
@@ -2816,11 +2809,6 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
-static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
-{
- set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
-}
-
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
/* Address WBINVD may be executed by guest */
@@ -2853,10 +2841,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
- if (kvm_lapic_hv_timer_in_use(vcpu) &&
- kvm_x86_ops->set_hv_timer(vcpu,
- kvm_get_lapic_target_expiration_tsc(vcpu)))
- kvm_lapic_switch_to_sw_timer(vcpu);
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_restart_hv_timer(vcpu);
+
/*
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
@@ -2864,7 +2852,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != cpu)
- kvm_migrate_timers(vcpu);
+ kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
vcpu->cpu = cpu;
}
@@ -2878,7 +2866,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
vcpu->arch.st.steal.preempted = 1;
- kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime,
+ kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal.preempted,
offsetof(struct kvm_steal_time, preempted),
sizeof(vcpu->arch.st.steal.preempted));
@@ -3124,7 +3112,14 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return -EINVAL;
if (events->exception.injected &&
- (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
+ is_guest_mode(vcpu)))
+ return -EINVAL;
+
+ /* INITs are latched while in SMM */
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
+ (events->smi.smm || events->smi.pending) &&
+ vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
return -EINVAL;
process_nmi(vcpu);
@@ -3301,11 +3296,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
}
}
+#define XSAVE_MXCSR_OFFSET 24
+
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
u64 xstate_bv =
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
/*
@@ -3313,11 +3311,13 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
- if (xstate_bv & ~kvm_supported_xcr0())
+ if (xstate_bv & ~kvm_supported_xcr0() ||
+ mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
- if (xstate_bv & ~XFEATURE_MASK_FPSSE)
+ if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
+ mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state.fxsave,
guest_xsave->region, sizeof(struct fxregs_state));
@@ -3721,22 +3721,21 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
+ struct kvm_pic *pic = kvm->arch.vpic;
int r;
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&chip->chip.pic,
- &pic_irqchip(kvm)->pics[0],
+ memcpy(&chip->chip.pic, &pic->pics[0],
sizeof(struct kvm_pic_state));
break;
case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&chip->chip.pic,
- &pic_irqchip(kvm)->pics[1],
+ memcpy(&chip->chip.pic, &pic->pics[1],
sizeof(struct kvm_pic_state));
break;
case KVM_IRQCHIP_IOAPIC:
- r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
@@ -3747,32 +3746,31 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
+ struct kvm_pic *pic = kvm->arch.vpic;
int r;
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic_irqchip(kvm)->lock);
- memcpy(&pic_irqchip(kvm)->pics[0],
- &chip->chip.pic,
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ spin_unlock(&pic->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic_irqchip(kvm)->lock);
- memcpy(&pic_irqchip(kvm)->pics[1],
- &chip->chip.pic,
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ spin_unlock(&pic->lock);
break;
case KVM_IRQCHIP_IOAPIC:
- r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
break;
}
- kvm_pic_update_irq(pic_irqchip(kvm));
+ kvm_pic_update_irq(pic);
return r;
}
@@ -4018,20 +4016,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_ioapic_init(kvm);
if (r) {
- mutex_lock(&kvm->slots_lock);
kvm_pic_destroy(kvm);
- mutex_unlock(&kvm->slots_lock);
goto create_irqchip_unlock;
}
r = kvm_setup_default_irq_routing(kvm);
if (r) {
- mutex_lock(&kvm->slots_lock);
- mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
kvm_pic_destroy(kvm);
- mutex_unlock(&kvm->irq_lock);
- mutex_unlock(&kvm->slots_lock);
goto create_irqchip_unlock;
}
/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
@@ -4196,10 +4188,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
r = 0;
- local_irq_disable();
- now_ns = __get_kvmclock_ns(kvm);
+ now_ns = get_kvmclock_ns(kvm);
kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
- local_irq_enable();
kvm_gen_update_masterclock(kvm);
break;
}
@@ -4207,11 +4197,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
struct kvm_clock_data user_ns;
u64 now_ns;
- local_irq_disable();
- now_ns = __get_kvmclock_ns(kvm);
+ now_ns = get_kvmclock_ns(kvm);
user_ns.clock = now_ns;
user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
- local_irq_enable();
memset(&user_ns.pad, 0, sizeof(user_ns.pad));
r = -EFAULT;
@@ -4230,7 +4218,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
break;
}
default:
- r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
+ r = -ENOTTY;
}
out:
return r;
@@ -4843,16 +4831,20 @@ emul_write:
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
{
- /* TODO: String I/O for in kernel device */
- int r;
+ int r = 0, i;
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
- else
- r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
+ for (i = 0; i < vcpu->arch.pio.count; i++) {
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
+ if (r)
+ break;
+ pd += vcpu->arch.pio.size;
+ }
return r;
}
@@ -4890,6 +4882,8 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
if (vcpu->arch.pio.count)
goto data_avail;
+ memset(vcpu->arch.pio_data, 0, size * count);
+
ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
if (ret) {
data_avail:
@@ -5073,6 +5067,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
if (var.unusable) {
memset(desc, 0, sizeof(*desc));
+ if (base3)
+ *base3 = 0;
return false;
}
@@ -5223,6 +5219,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
}
+static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
+{
+ return emul_to_vcpu(ctxt)->arch.hflags;
+}
+
+static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+{
+ kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5262,6 +5268,8 @@ static const struct x86_emulate_ops emulate_ops = {
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
.set_nmi_mask = emulator_set_nmi_mask,
+ .get_hflags = emulator_get_hflags,
+ .set_hflags = emulator_set_hflags,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -5305,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
@@ -5314,7 +5324,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
- ctxt->emul_flags = vcpu->arch.hflags;
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
@@ -5521,36 +5530,25 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * rflags is the old, "raw" value of the flags. The new value has
- * not been saved yet.
- *
- * This is correct even for TF set by the guest, because "the
- * processor will not generate this exception after the instruction
- * that sets the TF flag".
- */
- if (unlikely(rflags & X86_EFLAGS_TF)) {
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
- DR6_RTM;
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = EMULATE_USER_EXIT;
- } else {
- /*
- * "Certain debug exceptions may clear bit 0-3. The
- * remaining contents of the DR6 register are never
- * cleared by the processor".
- */
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
- }
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
}
}
@@ -5560,7 +5558,17 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
int r = EMULATE_DONE;
kvm_x86_ops->skip_emulated_instruction(vcpu);
- kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+
+ /*
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ if (unlikely(rflags & X86_EFLAGS_TF))
+ kvm_vcpu_do_singlestep(vcpu, &r);
return r == EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
@@ -5718,11 +5726,10 @@ restart:
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- if (vcpu->arch.hflags != ctxt->emul_flags)
- kvm_set_hflags(vcpu, ctxt->emul_flags);
kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE)
- kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+ if (r == EMULATE_DONE &&
+ (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP)
__kvm_set_rflags(vcpu, ctxt->eflags);
@@ -6004,7 +6011,7 @@ static void kvm_set_mmio_spte_mask(void)
mask &= ~1ull;
#endif
- kvm_mmu_set_mmio_spte_mask(mask);
+ kvm_mmu_set_mmio_spte_mask(mask, mask);
}
#ifdef CONFIG_X86_64
@@ -6726,7 +6733,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
- if (vcpu->requests) {
+ if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -6869,7 +6876,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* 1) We should set ->mode before checking ->requests. Please see
- * the comment in kvm_make_all_cpus_request.
+ * the comment in kvm_vcpu_exiting_guest_mode().
*
* 2) For APICv, we should set ->mode before checking PIR.ON. This
* pairs with the memory barrier implicit in pi_test_and_set_on
@@ -6890,7 +6897,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->sync_pir_to_irr(vcpu);
}
- if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
+ if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
|| need_resched() || signal_pending(current)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@@ -7051,7 +7058,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (r <= 0)
break;
- clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
@@ -7179,7 +7186,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu);
kvm_apic_accept_events(vcpu);
- clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+ kvm_clear_request(KVM_REQ_UNHALT, vcpu);
r = -EAGAIN;
goto out;
}
@@ -7355,6 +7362,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
return -EINVAL;
+ /* INITs are latched while in SMM */
+ if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+ (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+ mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
+ return -EINVAL;
+
if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
@@ -7724,6 +7737,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!init_event) {
kvm_pmu_reset(vcpu);
vcpu->arch.smbase = 0x30000;
+
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+ vcpu->arch.msr_misc_features_enables = 0;
}
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
@@ -8068,7 +8084,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
{
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
- kvm_free_all_assigned_devices(kvm);
kvm_free_pit(kvm);
}
@@ -8152,12 +8167,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
if (kvm_x86_ops->vm_destroy)
kvm_x86_ops->vm_destroy(kvm);
- kvm_iommu_unmap_guest(kvm);
- kfree(kvm->arch.vpic);
- kfree(kvm->arch.vioapic);
+ kvm_pic_destroy(kvm);
+ kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kvm_mmu_uninit_vm(kvm);
+ kvm_page_track_cleanup(kvm);
}
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@ -8198,13 +8213,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
slot->base_gfn, level) + 1;
slot->arch.rmap[i] =
- kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
+ kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL);
if (!slot->arch.rmap[i])
goto out_free;
if (i == 0)
continue;
- linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+ linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL);
if (!linfo)
goto out_free;
@@ -8381,10 +8396,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (vcpu->arch.pv.pv_unhalted)
return true;
- if (atomic_read(&vcpu->arch.nmi_queued))
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ kvm_x86_ops->nmi_allowed(vcpu)))
return true;
- if (test_bit(KVM_REQ_SMI, &vcpu->requests))
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending && !is_smm(vcpu)))
return true;
if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -8535,8 +8553,9 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
{
- return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val,
- sizeof(val));
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+ sizeof(val));
}
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
@@ -8566,11 +8585,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
{
struct x86_exception fault;
- trace_kvm_async_pf_ready(work->arch.token, work->gva);
if (work->wakeup_all)
work->arch.token = ~0; /* broadcast wakeup */
else
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+ trace_kvm_async_pf_ready(work->arch.token, work->gva);
if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
@@ -8590,8 +8609,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
return true;
else
- return !kvm_event_needs_reinjection(vcpu) &&
- kvm_x86_ops->interrupt_allowed(vcpu);
+ return kvm_can_do_async_pf(vcpu);
}
void kvm_arch_start_assignment(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e8ff3e4ce38a..612067074905 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -1,6 +1,8 @@
#ifndef ARCH_X86_KVM_X86_H
#define ARCH_X86_KVM_X86_H
+#include <asm/processor.h>
+#include <asm/mwait.h>
#include <linux/kvm_host.h>
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
@@ -212,4 +214,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
__rem; \
})
+static inline bool kvm_mwait_in_guest(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT))
+ return false;
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ /* All AMD CPUs have a working MWAIT implementation */
+ return true;
+ case X86_VENDOR_INTEL:
+ /* Handle Intel below */
+ break;
+ default:
+ return false;
+ }
+
+ /*
+ * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as
+ * they would allow guest to stop the CPU completely by disabling
+ * interrupts then invoking MWAIT.
+ */
+ if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return false;
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+
+ if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK))
+ return false;
+
+ return true;
+}
+
#endif