summaryrefslogtreecommitdiff
path: root/virt
diff options
context:
space:
mode:
Diffstat (limited to 'virt')
-rw-r--r--virt/kvm/arm/pmu.c48
-rw-r--r--virt/kvm/kvm_main.c238
2 files changed, 230 insertions, 56 deletions
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 362a01886bab..8731dfeced8b 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -8,6 +8,7 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
+#include <linux/perf/arm_pmu.h>
#include <linux/uaccess.h>
#include <asm/kvm_emulate.h>
#include <kvm/arm_pmu.h>
@@ -146,8 +147,7 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
if (kvm_pmu_pmc_is_chained(pmc) &&
kvm_pmu_idx_is_high_counter(select_idx))
counter = upper_32_bits(counter);
-
- else if (!kvm_pmu_idx_is_64bit(vcpu, select_idx))
+ else if (select_idx != ARMV8_PMU_CYCLE_IDX)
counter = lower_32_bits(counter);
return counter;
@@ -193,7 +193,7 @@ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
*/
static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
{
- u64 counter, reg;
+ u64 counter, reg, val;
pmc = kvm_pmu_get_canonical_pmc(pmc);
if (!pmc->perf_event)
@@ -201,16 +201,19 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
- if (kvm_pmu_pmc_is_chained(pmc)) {
- reg = PMEVCNTR0_EL0 + pmc->idx;
- __vcpu_sys_reg(vcpu, reg) = lower_32_bits(counter);
- __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter);
+ if (pmc->idx == ARMV8_PMU_CYCLE_IDX) {
+ reg = PMCCNTR_EL0;
+ val = counter;
} else {
- reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
- ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
- __vcpu_sys_reg(vcpu, reg) = lower_32_bits(counter);
+ reg = PMEVCNTR0_EL0 + pmc->idx;
+ val = lower_32_bits(counter);
}
+ __vcpu_sys_reg(vcpu, reg) = val;
+
+ if (kvm_pmu_pmc_is_chained(pmc))
+ __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter);
+
kvm_pmu_release_perf_event(pmc);
}
@@ -440,8 +443,25 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
struct pt_regs *regs)
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu);
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
int idx = pmc->idx;
+ u64 period;
+
+ cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE);
+
+ /*
+ * Reset the sample period to the architectural limit,
+ * i.e. the point where the counter overflows.
+ */
+ period = -(local64_read(&perf_event->count));
+
+ if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
+ period &= GENMASK(31, 0);
+
+ local64_set(&perf_event->hw.period_left, 0);
+ perf_event->attr.sample_period = period;
+ perf_event->hw.sample_period = period;
__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
@@ -449,6 +469,8 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
kvm_vcpu_kick(vcpu);
}
+
+ cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD);
}
/**
@@ -567,12 +589,12 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
* high counter.
*/
attr.sample_period = (-counter) & GENMASK(63, 0);
+ if (kvm_pmu_counter_is_enabled(vcpu, pmc->idx + 1))
+ attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED;
+
event = perf_event_create_kernel_counter(&attr, -1, current,
kvm_pmu_perf_overflow,
pmc + 1);
-
- if (kvm_pmu_counter_is_enabled(vcpu, pmc->idx + 1))
- attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED;
} else {
/* The initial sample period (overflow count) of an event. */
if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 66a977472a1c..fef50464d56a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -50,6 +50,7 @@
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
+#include <linux/kthread.h>
#include <asm/processor.h>
#include <asm/ioctl.h>
@@ -121,9 +122,22 @@ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
#define KVM_COMPAT(c) .compat_ioctl = (c)
#else
+/*
+ * For architectures that don't implement a compat infrastructure,
+ * adopt a double line of defense:
+ * - Prevent a compat task from opening /dev/kvm
+ * - If the open has been done by a 64bit task, and the KVM fd
+ * passed to a compat task, let the ioctls fail.
+ */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg) { return -EINVAL; }
-#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl
+
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
+{
+ return is_compat_task() ? -ENODEV : 0;
+}
+#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
+ .open = kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);
@@ -149,10 +163,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
return 0;
}
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+ /*
+ * The metadata used by is_zone_device_page() to determine whether or
+ * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+ * the device has been pinned, e.g. by get_user_pages(). WARN if the
+ * page_count() is zero to help detect bad usage of this helper.
+ */
+ if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+ return false;
+
+ return is_zone_device_page(pfn_to_page(pfn));
+}
+
bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
+ /*
+ * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+ * perspective they are "normal" pages, albeit with slightly different
+ * usage rules.
+ */
if (pfn_valid(pfn))
- return PageReserved(pfn_to_page(pfn));
+ return PageReserved(pfn_to_page(pfn)) &&
+ !kvm_is_zone_device_pfn(pfn);
return true;
}
@@ -625,10 +659,28 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
return 0;
}
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
static struct kvm *kvm_create_vm(unsigned long type)
{
- int r, i;
struct kvm *kvm = kvm_arch_alloc_vm();
+ int r = -ENOMEM;
+ int i;
if (!kvm)
return ERR_PTR(-ENOMEM);
@@ -640,46 +692,51 @@ static struct kvm *kvm_create_vm(unsigned long type)
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
- refcount_set(&kvm->users_count, 1);
INIT_LIST_HEAD(&kvm->devices);
- r = kvm_arch_init_vm(kvm, type);
- if (r)
- goto out_err_no_disable;
-
- r = hardware_enable_all();
- if (r)
- goto out_err_no_disable;
-
-#ifdef CONFIG_HAVE_KVM_IRQFD
- INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
-#endif
-
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
- r = -ENOMEM;
+ if (init_srcu_struct(&kvm->srcu))
+ goto out_err_no_srcu;
+ if (init_srcu_struct(&kvm->irq_srcu))
+ goto out_err_no_irq_srcu;
+
+ refcount_set(&kvm->users_count, 1);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
struct kvm_memslots *slots = kvm_alloc_memslots();
+
if (!slots)
- goto out_err_no_srcu;
+ goto out_err_no_arch_destroy_vm;
/* Generations must be different for each address space. */
slots->generation = i;
rcu_assign_pointer(kvm->memslots[i], slots);
}
- if (init_srcu_struct(&kvm->srcu))
- goto out_err_no_srcu;
- if (init_srcu_struct(&kvm->irq_srcu))
- goto out_err_no_irq_srcu;
for (i = 0; i < KVM_NR_BUSES; i++) {
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
if (!kvm->buses[i])
- goto out_err;
+ goto out_err_no_arch_destroy_vm;
}
+ r = kvm_arch_init_vm(kvm, type);
+ if (r)
+ goto out_err_no_arch_destroy_vm;
+
+ r = hardware_enable_all();
+ if (r)
+ goto out_err_no_disable;
+
+#ifdef CONFIG_HAVE_KVM_IRQFD
+ INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
+#endif
+
r = kvm_init_mmu_notifier(kvm);
if (r)
+ goto out_err_no_mmu_notifier;
+
+ r = kvm_arch_post_init_vm(kvm);
+ if (r)
goto out_err;
mutex_lock(&kvm_lock);
@@ -691,17 +748,24 @@ static struct kvm *kvm_create_vm(unsigned long type)
return kvm;
out_err:
- cleanup_srcu_struct(&kvm->irq_srcu);
-out_err_no_irq_srcu:
- cleanup_srcu_struct(&kvm->srcu);
-out_err_no_srcu:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ if (kvm->mmu_notifier.ops)
+ mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
hardware_disable_all();
out_err_no_disable:
- refcount_set(&kvm->users_count, 0);
+ kvm_arch_destroy_vm(kvm);
+out_err_no_arch_destroy_vm:
+ WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm_get_bus(kvm, i));
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+ cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
+ cleanup_srcu_struct(&kvm->srcu);
+out_err_no_srcu:
kvm_arch_free_vm(kvm);
mmdrop(current->mm);
return ERR_PTR(r);
@@ -733,6 +797,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
mutex_lock(&kvm_lock);
list_del(&kvm->vm_list);
mutex_unlock(&kvm_lock);
+ kvm_arch_pre_destroy_vm(kvm);
+
kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++) {
struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -1853,7 +1919,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
void kvm_set_pfn_dirty(kvm_pfn_t pfn)
{
- if (!kvm_is_reserved_pfn(pfn)) {
+ if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
struct page *page = pfn_to_page(pfn);
SetPageDirty(page);
@@ -1863,7 +1929,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
void kvm_set_pfn_accessed(kvm_pfn_t pfn)
{
- if (!kvm_is_reserved_pfn(pfn))
+ if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
@@ -2360,20 +2426,23 @@ out:
kvm_arch_vcpu_unblocking(vcpu);
block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
- if (!vcpu_valid_wakeup(vcpu))
- shrink_halt_poll_ns(vcpu);
- else if (halt_poll_ns) {
- if (block_ns <= vcpu->halt_poll_ns)
- ;
- /* we had a long block, shrink polling */
- else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+ if (!kvm_arch_no_poll(vcpu)) {
+ if (!vcpu_valid_wakeup(vcpu)) {
shrink_halt_poll_ns(vcpu);
- /* we had a short halt and our poll time is too small */
- else if (vcpu->halt_poll_ns < halt_poll_ns &&
- block_ns < halt_poll_ns)
- grow_halt_poll_ns(vcpu);
- } else
- vcpu->halt_poll_ns = 0;
+ } else if (halt_poll_ns) {
+ if (block_ns <= vcpu->halt_poll_ns)
+ ;
+ /* we had a long block, shrink polling */
+ else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+ shrink_halt_poll_ns(vcpu);
+ /* we had a short halt and our poll time is too small */
+ else if (vcpu->halt_poll_ns < halt_poll_ns &&
+ block_ns < halt_poll_ns)
+ grow_halt_poll_ns(vcpu);
+ } else {
+ vcpu->halt_poll_ns = 0;
+ }
+ }
trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
kvm_arch_vcpu_block_finish(vcpu);
@@ -4364,3 +4433,86 @@ void kvm_exit(void)
kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+ struct kvm *kvm;
+ struct task_struct *parent;
+ struct completion init_done;
+ kvm_vm_thread_fn_t thread_fn;
+ uintptr_t data;
+ int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+ /*
+ * The init_context is allocated on the stack of the parent thread, so
+ * we have to locally copy anything that is needed beyond initialization
+ */
+ struct kvm_vm_worker_thread_context *init_context = context;
+ struct kvm *kvm = init_context->kvm;
+ kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+ uintptr_t data = init_context->data;
+ int err;
+
+ err = kthread_park(current);
+ /* kthread_park(current) is never supposed to return an error */
+ WARN_ON(err != 0);
+ if (err)
+ goto init_complete;
+
+ err = cgroup_attach_task_all(init_context->parent, current);
+ if (err) {
+ kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+ __func__, err);
+ goto init_complete;
+ }
+
+ set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+ init_context->err = err;
+ complete(&init_context->init_done);
+ init_context = NULL;
+
+ if (err)
+ return err;
+
+ /* Wait to be woken up by the spawner before proceeding. */
+ kthread_parkme();
+
+ if (!kthread_should_stop())
+ err = thread_fn(kvm, data);
+
+ return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+ uintptr_t data, const char *name,
+ struct task_struct **thread_ptr)
+{
+ struct kvm_vm_worker_thread_context init_context = {};
+ struct task_struct *thread;
+
+ *thread_ptr = NULL;
+ init_context.kvm = kvm;
+ init_context.parent = current;
+ init_context.thread_fn = thread_fn;
+ init_context.data = data;
+ init_completion(&init_context.init_done);
+
+ thread = kthread_run(kvm_vm_worker_thread, &init_context,
+ "%s-%d", name, task_pid_nr(current));
+ if (IS_ERR(thread))
+ return PTR_ERR(thread);
+
+ /* kthread_run is never supposed to return NULL */
+ WARN_ON(thread == NULL);
+
+ wait_for_completion(&init_context.init_done);
+
+ if (!init_context.err)
+ *thread_ptr = thread;
+
+ return init_context.err;
+}