path: root/virt/kvm
author     Linus Torvalds <torvalds@linux-foundation.org>  2021-06-28 15:40:51 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2021-06-28 15:40:51 -0700
commit     36824f198c621cebeb22966b5e244378fa341295 (patch)
tree       ee1e358a4ed0cd022ae12b4b7ba1fa3d0e5746d5 /virt/kvm
parent     9840cfcb97fc8b6aa7b36cec3cc3fd763f14052e (diff)
parent     b8917b4ae44d1b945f6fba3d8ee6777edb44633b (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
 "This covers all architectures (except MIPS) so I don't expect any
  other feature pull requests this merge window.

  ARM:
   - Add MTE support in guests, complete with tag save/restore interface
   - Reduce the impact of CMOs by moving them in the page-table code
   - Allow device block mappings at stage-2
   - Reduce the footprint of the vmemmap in protected mode
   - Support the vGIC on dumb systems such as the Apple M1
   - Add selftest infrastructure to support multiple configuration and
     apply that to PMU/non-PMU setups
   - Add selftests for the debug architecture
   - The usual crop of PMU fixes

  PPC:
   - Support for the H_RPT_INVALIDATE hypercall
   - Conversion of Book3S entry/exit to C
   - Bug fixes

  S390:
   - new HW facilities for guests
   - make inline assembly more robust with KASAN and co

  x86:
   - Allow userspace to handle emulation errors (unknown instructions)
   - Lazy allocation of the rmap (host physical -> guest physical address)
   - Support for virtualizing TSC scaling on VMX machines
   - Optimizations to avoid shattering huge pages at the beginning of
     live migration
   - Support for initializing the PDPTRs without loading them from memory
   - Many TLB flushing cleanups
   - Refuse to load if two-stage paging is available but NX is not (this
     has been a requirement in practice for over a year)
   - A large series that separates the MMU mode (WP/SMAP/SMEP etc.) from
     CR0/CR4/EFER, using the MMU mode everywhere once it is computed from
     the CPU registers
   - Use PM notifier to notify the guest about host suspend or hibernate
   - Support for passing arguments to Hyper-V hypercalls using XMM
     registers
   - Support for Hyper-V TLB flush hypercalls and enlightened MSR bitmap
     on AMD processors
   - Hide Hyper-V hypercalls that are not included in the guest CPUID
   - Fixes for live migration of virtual machines that use the Hyper-V
     "enlightened VMCS" optimization of nested virtualization
   - Bugfixes (not many)

  Generic:
   - Support for retrieving statistics without debugfs
   - Cleanups for the KVM selftests API"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (314 commits)
  KVM: x86: rename apic_access_page_done to apic_access_memslot_enabled
  kvm: x86: disable the narrow guest module parameter on unload
  selftests: kvm: Allows userspace to handle emulation errors.
  kvm: x86: Allow userspace to handle emulation errors
  KVM: x86/mmu: Let guest use GBPAGES if supported in hardware and TDP is on
  KVM: x86/mmu: Get CR4.SMEP from MMU, not vCPU, in shadow page fault
  KVM: x86/mmu: Get CR0.WP from MMU, not vCPU, in shadow page fault
  KVM: x86/mmu: Drop redundant rsvd bits reset for nested NPT
  KVM: x86/mmu: Optimize and clean up so called "last nonleaf level" logic
  KVM: x86: Enhance comments for MMU roles and nested transition trickiness
  KVM: x86/mmu: WARN on any reserved SPTE value when making a valid SPTE
  KVM: x86/mmu: Add helpers to do full reserved SPTE checks w/ generic MMU
  KVM: x86/mmu: Use MMU's role to determine PTTYPE
  KVM: x86/mmu: Collapse 32-bit PAE and 64-bit statements for helpers
  KVM: x86/mmu: Add a helper to calculate root from role_regs
  KVM: x86/mmu: Add helper to update paging metadata
  KVM: x86/mmu: Don't update nested guest's paging bitmasks if CR0.PG=0
  KVM: x86/mmu: Consolidate reset_rsvds_bits_mask() calls
  KVM: x86/mmu: Use MMU role_regs to get LA57, and drop vCPU LA57 helper
  KVM: x86/mmu: Get nested MMU's root level from the MMU's role
  ...
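Of the generic items above, "Support for retrieving statistics without debugfs" is the one most visible in this directory: it adds a KVM_CAP_BINARY_STATS_FD capability and a KVM_GET_STATS_FD ioctl on the VM and vCPU file descriptors (see virt/kvm/binary_stats.c and the kvm_main.c hunks below). A minimal userspace sketch of obtaining and peeking at a per-VM stats fd could look like the following; the kvm_stats_header field names reflect my reading of the new uapi header and should be checked against <linux/kvm.h> of a kernel that contains this merge:

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <unistd.h>
  #include <linux/kvm.h>

  int main(void)
  {
          int sys_fd = open("/dev/kvm", O_RDWR);
          int vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);
          struct kvm_stats_header hdr;
          int stats_fd;

          /* Both the capability and the ioctl are new in this pull request. */
          if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_BINARY_STATS_FD) != 1)
                  return 1;

          stats_fd = ioctl(vm_fd, KVM_GET_STATS_FD, NULL);
          if (stats_fd < 0)
                  return 1;

          /* The header sits at offset 0 and describes the rest of the layout. */
          if (pread(stats_fd, &hdr, sizeof(hdr), 0) != sizeof(hdr))
                  return 1;

          printf("%u stats, data block at offset %u\n",
                 hdr.num_desc, hdr.data_offset);

          close(stats_fd);
          close(vm_fd);
          close(sys_fd);
          return 0;
  }

The same ioctl exists on vCPU file descriptors, backed by kvm_vcpu_ioctl_get_stats_fd() below.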
Diffstat (limited to 'virt/kvm')
-rw-r--r--   virt/kvm/Kconfig          3
-rw-r--r--   virt/kvm/binary_stats.c   146
-rw-r--r--   virt/kvm/kvm_main.c       323
3 files changed, 423 insertions, 49 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 1c37ccd5d402..62b39149b8c8 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -63,3 +63,6 @@ config HAVE_KVM_NO_POLL
config KVM_XFER_TO_GUEST_WORK
bool
+
+config HAVE_KVM_PM_NOTIFIER
+ bool
diff --git a/virt/kvm/binary_stats.c b/virt/kvm/binary_stats.c
new file mode 100644
index 000000000000..e609d428811a
--- /dev/null
+++ b/virt/kvm/binary_stats.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM binary statistics interface implementation
+ *
+ * Copyright 2021 Google LLC
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+
+/**
+ * kvm_stats_read() - Common function to read from the binary statistics
+ * file descriptor.
+ *
+ * @id: identification string of the stats
+ * @header: stats header for a vm or a vcpu
+ * @desc: start address of an array of stats descriptors for a vm or a vcpu
+ * @stats: start address of stats data block for a vm or a vcpu
+ * @size_stats: the size of the stats data block pointed to by @stats
+ * @user_buffer: start address of userspace buffer
+ * @size: requested read size from userspace
+ * @offset: the start position from which the content will be read for the
+ * corresponding vm or vcpu file descriptor
+ *
+ * The file content of a vm/vcpu file descriptor is now defined as below:
+ * +-------------+
+ * | Header |
+ * +-------------+
+ * | id string |
+ * +-------------+
+ * | Descriptors |
+ * +-------------+
+ * | Stats Data |
+ * +-------------+
+ * Although this function allows userspace to read any amount of data (as
+ * long as it stays within the limit) from any position, the typical usage
+ * follows these steps:
+ * 1. Read the header from offset 0. Get the offset of the descriptors and
+ *    the stats data, plus other necessary information. This is one-time
+ *    work for the lifetime of the corresponding vm/vcpu stats fd.
+ * 2. Read the id string from its offset. This is one-time work for the
+ *    lifetime of the corresponding vm/vcpu stats fd.
+ * 3. Read the descriptors from their offset and discover all the stats by
+ *    parsing them. This is one-time work for the lifetime of the
+ *    corresponding vm/vcpu stats fd.
+ * 4. Periodically read the stats data from its offset using pread.
+ *
+ * Return: the number of bytes that have been successfully read
+ */
+ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
+ const struct _kvm_stats_desc *desc,
+ void *stats, size_t size_stats,
+ char __user *user_buffer, size_t size, loff_t *offset)
+{
+ ssize_t len;
+ ssize_t copylen;
+ ssize_t remain = size;
+ size_t size_desc;
+ size_t size_header;
+ void *src;
+ loff_t pos = *offset;
+ char __user *dest = user_buffer;
+
+ size_header = sizeof(*header);
+ size_desc = header->num_desc * sizeof(*desc);
+
+ len = KVM_STATS_NAME_SIZE + size_header + size_desc + size_stats - pos;
+ len = min(len, remain);
+ if (len <= 0)
+ return 0;
+ remain = len;
+
+ /*
+ * Copy kvm stats header.
+ * The header is the first block of content userspace usually reads out.
+ * In that case pos is 0 and both copylen and remain equal the size of the
+ * header. The header copy is skipped if the offset is larger than the size
+ * of the header, which usually happens when userspace reads the stats
+ * descriptors and stats data.
+ */
+ copylen = size_header - pos;
+ copylen = min(copylen, remain);
+ if (copylen > 0) {
+ src = (void *)header + pos;
+ if (copy_to_user(dest, src, copylen))
+ return -EFAULT;
+ remain -= copylen;
+ pos += copylen;
+ dest += copylen;
+ }
+
+ /*
+ * Copy kvm stats header id string.
+ * The id string is unique for every vm/vcpu and is stored in the kvm and
+ * kvm_vcpu structures. From the perspective of userspace, the id string is
+ * part of the stats header; it is usually read out together with the
+ * preceding fixed header and can be skipped when descriptors and stats
+ * data are read later.
+ */
+ copylen = header->id_offset + KVM_STATS_NAME_SIZE - pos;
+ copylen = min(copylen, remain);
+ if (copylen > 0) {
+ src = id + pos - header->id_offset;
+ if (copy_to_user(dest, src, copylen))
+ return -EFAULT;
+ remain -= copylen;
+ pos += copylen;
+ dest += copylen;
+ }
+
+ /*
+ * Copy kvm stats descriptors.
+ * The copy of the descriptors is skipped in the typical case in which
+ * userspace periodically reads the stats data, since pos is then greater
+ * than the end address of the descriptors
+ * (header->desc_offset + size_desc), making copylen <= 0.
+ */
+ copylen = header->desc_offset + size_desc - pos;
+ copylen = min(copylen, remain);
+ if (copylen > 0) {
+ src = (void *)desc + pos - header->desc_offset;
+ if (copy_to_user(dest, src, copylen))
+ return -EFAULT;
+ remain -= copylen;
+ pos += copylen;
+ dest += copylen;
+ }
+
+ /* Copy kvm stats values */
+ copylen = header->data_offset + size_stats - pos;
+ copylen = min(copylen, remain);
+ if (copylen > 0) {
+ src = stats + pos - header->data_offset;
+ if (copy_to_user(dest, src, copylen))
+ return -EFAULT;
+ remain -= copylen;
+ pos += copylen;
+ dest += copylen;
+ }
+
+ *offset = pos;
+ return len;
+}
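Matching the layout and the four usage steps described in the comment at the top of this new file, a userspace consumer would parse the header and descriptors once and then pread() only the data block on every sampling interval. A rough sketch, assuming a stats fd already obtained with KVM_GET_STATS_FD and the struct kvm_stats_header / struct kvm_stats_desc uapi definitions added by this series (the field names and the one-u64-per-stat simplification are assumptions on my part):

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <linux/kvm.h>

  /* Dump every stat once; "fd" is a VM or vCPU stats fd from KVM_GET_STATS_FD. */
  static int dump_stats_once(int fd)
  {
          struct kvm_stats_header hdr;
          char *descs = NULL;
          uint64_t *data = NULL;
          size_t desc_sz;
          uint32_t i;

          /* Step 1: the header describes where everything else lives. */
          if (pread(fd, &hdr, sizeof(hdr), 0) != sizeof(hdr))
                  return -1;

          /* Step 3: read all descriptors in one go; each entry carries its name. */
          desc_sz = sizeof(struct kvm_stats_desc) + hdr.name_size;
          descs = malloc(desc_sz * hdr.num_desc);
          /* Simplification: assumes every stat is a single u64 (desc->size == 1). */
          data = malloc(sizeof(uint64_t) * hdr.num_desc);
          if (!descs || !data ||
              pread(fd, descs, desc_sz * hdr.num_desc, hdr.desc_offset) < 0)
                  goto err;

          /* Step 4: periodically pread the data block; done a single time here. */
          if (pread(fd, data, sizeof(uint64_t) * hdr.num_desc, hdr.data_offset) < 0)
                  goto err;

          for (i = 0; i < hdr.num_desc; i++) {
                  struct kvm_stats_desc *d = (void *)(descs + i * desc_sz);

                  printf("%-40s %llu\n", d->name,
                         (unsigned long long)data[d->offset / sizeof(uint64_t)]);
          }
          free(descs);
          free(data);
          return 0;
  err:
          free(descs);
          free(data);
          return -1;
  }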
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b68678b52129..6866c1780cf5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,7 @@
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
+#include <linux/suspend.h>
#include <asm/processor.h>
#include <asm/ioctl.h>
@@ -114,7 +115,6 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
-static int kvm_debugfs_num_entries;
static const struct file_operations stat_fops_per_vm;
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
@@ -331,7 +331,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
*/
if (!kvm_arch_flush_remote_tlb(kvm)
|| kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
- ++kvm->stat.remote_tlb_flush;
+ ++kvm->stat.generic.remote_tlb_flush;
cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
@@ -780,6 +780,38 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_pm_notifier_call(struct notifier_block *bl,
+ unsigned long state,
+ void *unused)
+{
+ struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
+
+ return kvm_arch_pm_notifier(kvm, state);
+}
+
+static void kvm_init_pm_notifier(struct kvm *kvm)
+{
+ kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
+ /* Suspend KVM before we suspend ftrace, RCU, etc. */
+ kvm->pm_notifier.priority = INT_MAX;
+ register_pm_notifier(&kvm->pm_notifier);
+}
+
+static void kvm_destroy_pm_notifier(struct kvm *kvm)
+{
+ unregister_pm_notifier(&kvm->pm_notifier);
+}
+#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
+static void kvm_init_pm_notifier(struct kvm *kvm)
+{
+}
+
+static void kvm_destroy_pm_notifier(struct kvm *kvm)
+{
+}
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+
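The generic notifier above only forwards the PM event to kvm_arch_pm_notifier(); what happens with it is entirely up to arch code. A sketch of how an architecture might implement the hook, loosely modeled on the x86 user of this series (the kvm_set_guest_paused() call and the choice of PM events are illustrative assumptions, not necessarily the exact upstream code):

  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
  /* Illustrative sketch only: pause the guest clock on every vCPU before suspend. */
  int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
  {
          struct kvm_vcpu *vcpu;
          int i, ret;

          switch (state) {
          case PM_HIBERNATION_PREPARE:
          case PM_SUSPEND_PREPARE:
                  mutex_lock(&kvm->lock);
                  kvm_for_each_vcpu(i, vcpu, kvm) {
                          /* Hypothetical per-vCPU "guest paused" notification. */
                          ret = kvm_set_guest_paused(vcpu);
                          if (ret) {
                                  mutex_unlock(&kvm->lock);
                                  return NOTIFY_BAD;
                          }
                  }
                  mutex_unlock(&kvm->lock);
                  return NOTIFY_DONE;
          }

          return NOTIFY_DONE;
  }
  #endif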
static struct kvm_memslots *kvm_alloc_memslots(void)
{
int i;
@@ -827,9 +859,24 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
kvfree(slots);
}
+static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
+{
+ switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
+ case KVM_STATS_TYPE_INSTANT:
+ return 0444;
+ case KVM_STATS_TYPE_CUMULATIVE:
+ case KVM_STATS_TYPE_PEAK:
+ default:
+ return 0644;
+ }
+}
+
+
static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
int i;
+ int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+ kvm_vcpu_stats_header.num_desc;
if (!kvm->debugfs_dentry)
return;
@@ -847,7 +894,10 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
{
char dir_name[ITOA_MAX_LEN * 2];
struct kvm_stat_data *stat_data;
- struct kvm_stats_debugfs_item *p;
+ const struct _kvm_stats_desc *pdesc;
+ int i;
+ int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+ kvm_vcpu_stats_header.num_desc;
if (!debugfs_initialized())
return 0;
@@ -861,15 +911,32 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
if (!kvm->debugfs_stat_data)
return -ENOMEM;
- for (p = debugfs_entries; p->name; p++) {
+ for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
+ pdesc = &kvm_vm_stats_desc[i];
+ stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
+ if (!stat_data)
+ return -ENOMEM;
+
+ stat_data->kvm = kvm;
+ stat_data->desc = pdesc;
+ stat_data->kind = KVM_STAT_VM;
+ kvm->debugfs_stat_data[i] = stat_data;
+ debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+ kvm->debugfs_dentry, stat_data,
+ &stat_fops_per_vm);
+ }
+
+ for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
+ pdesc = &kvm_vcpu_stats_desc[i];
stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
if (!stat_data)
return -ENOMEM;
stat_data->kvm = kvm;
- stat_data->dbgfs_item = p;
- kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
- debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
+ stat_data->desc = pdesc;
+ stat_data->kind = KVM_STAT_VCPU;
+ kvm->debugfs_stat_data[i] = stat_data;
+ debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
kvm->debugfs_dentry, stat_data,
&stat_fops_per_vm);
}
@@ -909,6 +976,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
+ mutex_init(&kvm->slots_arch_lock);
INIT_LIST_HEAD(&kvm->devices);
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -963,6 +1031,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
mutex_unlock(&kvm_lock);
preempt_notifier_inc();
+ kvm_init_pm_notifier(kvm);
return kvm;
@@ -1010,6 +1079,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
int i;
struct mm_struct *mm = kvm->mm;
+ kvm_destroy_pm_notifier(kvm);
kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
kvm_destroy_vm_debugfs(kvm);
kvm_arch_sync_events(kvm);
@@ -1281,6 +1351,14 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
rcu_assign_pointer(kvm->memslots[as_id], slots);
+
+ /*
+ * Acquired in kvm_set_memslot. Must be released before synchronize
+ * SRCU below in order to avoid deadlock with another thread
+ * acquiring the slots_arch_lock in an srcu critical section.
+ */
+ mutex_unlock(&kvm->slots_arch_lock);
+
synchronize_srcu_expedited(&kvm->srcu);
/*
@@ -1307,6 +1385,18 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
return old_memslots;
}
+static size_t kvm_memslots_size(int slots)
+{
+ return sizeof(struct kvm_memslots) +
+ (sizeof(struct kvm_memory_slot) * slots);
+}
+
+static void kvm_copy_memslots(struct kvm_memslots *to,
+ struct kvm_memslots *from)
+{
+ memcpy(to, from, kvm_memslots_size(from->used_slots));
+}
+
/*
* Note, at a minimum, the current number of used slots must be allocated, even
* when deleting a memslot, as we need a complete duplicate of the memslots for
@@ -1316,19 +1406,16 @@ static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
enum kvm_mr_change change)
{
struct kvm_memslots *slots;
- size_t old_size, new_size;
-
- old_size = sizeof(struct kvm_memslots) +
- (sizeof(struct kvm_memory_slot) * old->used_slots);
+ size_t new_size;
if (change == KVM_MR_CREATE)
- new_size = old_size + sizeof(struct kvm_memory_slot);
+ new_size = kvm_memslots_size(old->used_slots + 1);
else
- new_size = old_size;
+ new_size = kvm_memslots_size(old->used_slots);
slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
if (likely(slots))
- memcpy(slots, old, old_size);
+ kvm_copy_memslots(slots, old);
return slots;
}
@@ -1343,9 +1430,27 @@ static int kvm_set_memslot(struct kvm *kvm,
struct kvm_memslots *slots;
int r;
+ /*
+ * Released in install_new_memslots.
+ *
+ * Must be held from before the current memslots are copied until
+ * after the new memslots are installed with rcu_assign_pointer,
+ * then released before the synchronize srcu in install_new_memslots.
+ *
+ * When modifying memslots outside of the slots_lock, must be held
+ * before reading the pointer to the current memslots until after all
+ * changes to those memslots are complete.
+ *
+ * These rules ensure that installing new memslots does not lose
+ * changes made to the previous memslots.
+ */
+ mutex_lock(&kvm->slots_arch_lock);
+
slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
- if (!slots)
+ if (!slots) {
+ mutex_unlock(&kvm->slots_arch_lock);
return -ENOMEM;
+ }
if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
/*
@@ -1356,10 +1461,9 @@ static int kvm_set_memslot(struct kvm *kvm,
slot->flags |= KVM_MEMSLOT_INVALID;
/*
- * We can re-use the old memslots, the only difference from the
- * newly installed memslots is the invalid flag, which will get
- * dropped by update_memslots anyway. We'll also revert to the
- * old memslots if preparing the new memory region fails.
+ * We can re-use the memory from the old memslots.
+ * It will be overwritten with a copy of the new memslots
+ * after reacquiring the slots_arch_lock below.
*/
slots = install_new_memslots(kvm, as_id, slots);
@@ -1371,6 +1475,17 @@ static int kvm_set_memslot(struct kvm *kvm,
* - kvm_is_visible_gfn (mmu_check_root)
*/
kvm_arch_flush_shadow_memslot(kvm, slot);
+
+ /* Released in install_new_memslots. */
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * The arch-specific fields of the memslots could have changed
+ * between releasing the slots_arch_lock in
+ * install_new_memslots and here, so get a fresh copy of the
+ * slots.
+ */
+ kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
}
r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
@@ -1386,8 +1501,13 @@ static int kvm_set_memslot(struct kvm *kvm,
return 0;
out_slots:
- if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
+ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
+ slot = id_to_memslot(slots, old->id);
+ slot->flags &= ~KVM_MEMSLOT_INVALID;
slots = install_new_memslots(kvm, as_id, slots);
+ } else {
+ mutex_unlock(&kvm->slots_arch_lock);
+ }
kvfree(slots);
return r;
}
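The locking rules spelled out in the comment at the top of kvm_set_memslot() also constrain arch code that modifies memslots outside of slots_lock. A hypothetical sketch of the pattern such a path would follow (the helper and the fields it touches are made up for illustration; only the lock and the iteration macro come from this code):

  /* Hypothetical arch-side helper, illustrating the slots_arch_lock protocol. */
  static void arch_refresh_memslot_metadata(struct kvm *kvm, int as_id)
  {
          struct kvm_memslots *slots;
          struct kvm_memory_slot *slot;

          /*
           * Hold the lock from before the memslots pointer is read until all
           * modifications are complete, so that a concurrent
           * install_new_memslots() cannot publish a copy of half-updated slots.
           */
          mutex_lock(&kvm->slots_arch_lock);

          slots = __kvm_memslots(kvm, as_id);
          kvm_for_each_memslot(slot, slots) {
                  /* ... update slot->arch.* fields in place ... */
          }

          mutex_unlock(&kvm->slots_arch_lock);
  }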
@@ -2958,9 +3078,9 @@ static inline void
update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
{
if (waited)
- vcpu->stat.halt_poll_fail_ns += poll_ns;
+ vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
else
- vcpu->stat.halt_poll_success_ns += poll_ns;
+ vcpu->stat.generic.halt_poll_success_ns += poll_ns;
}
/*
@@ -2978,16 +3098,16 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
- ++vcpu->stat.halt_attempted_poll;
+ ++vcpu->stat.generic.halt_attempted_poll;
do {
/*
* This sets KVM_REQ_UNHALT if an interrupt
* arrives.
*/
if (kvm_vcpu_check_block(vcpu) < 0) {
- ++vcpu->stat.halt_successful_poll;
+ ++vcpu->stat.generic.halt_successful_poll;
if (!vcpu_valid_wakeup(vcpu))
- ++vcpu->stat.halt_poll_invalid;
+ ++vcpu->stat.generic.halt_poll_invalid;
goto out;
}
poll_end = cur = ktime_get();
@@ -3044,7 +3164,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
waitp = kvm_arch_vcpu_get_wait(vcpu);
if (rcuwait_wake_up(waitp)) {
WRITE_ONCE(vcpu->ready, true);
- ++vcpu->stat.halt_wakeup;
+ ++vcpu->stat.generic.halt_wakeup;
return true;
}
@@ -3377,6 +3497,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
+ /* Fill the stats id string for the vcpu */
+ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
+ task_pid_nr(current), id);
+
/* Now it's all set up, let userspace reach it */
kvm_get_kvm(kvm);
r = create_vcpu_fd(vcpu);
@@ -3426,6 +3550,44 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
return 0;
}
+static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
+ size_t size, loff_t *offset)
+{
+ struct kvm_vcpu *vcpu = file->private_data;
+
+ return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
+ &kvm_vcpu_stats_desc[0], &vcpu->stat,
+ sizeof(vcpu->stat), user_buffer, size, offset);
+}
+
+static const struct file_operations kvm_vcpu_stats_fops = {
+ .read = kvm_vcpu_stats_read,
+ .llseek = noop_llseek,
+};
+
+static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
+{
+ int fd;
+ struct file *file;
+ char name[15 + ITOA_MAX_LEN + 1];
+
+ snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ return PTR_ERR(file);
+ }
+ file->f_mode |= FMODE_PREAD;
+ fd_install(fd, file);
+
+ return fd;
+}
+
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -3623,6 +3785,10 @@ out_free1:
r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
break;
}
+ case KVM_GET_STATS_FD: {
+ r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
+ break;
+ }
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
@@ -3881,6 +4047,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
#else
return 0;
#endif
+ case KVM_CAP_BINARY_STATS_FD:
+ return 1;
default:
break;
}
@@ -3984,6 +4152,42 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
}
}
+static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
+ size_t size, loff_t *offset)
+{
+ struct kvm *kvm = file->private_data;
+
+ return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
+ &kvm_vm_stats_desc[0], &kvm->stat,
+ sizeof(kvm->stat), user_buffer, size, offset);
+}
+
+static const struct file_operations kvm_vm_stats_fops = {
+ .read = kvm_vm_stats_read,
+ .llseek = noop_llseek,
+};
+
+static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
+{
+ int fd;
+ struct file *file;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ file = anon_inode_getfile("kvm-vm-stats",
+ &kvm_vm_stats_fops, kvm, O_RDONLY);
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ return PTR_ERR(file);
+ }
+ file->f_mode |= FMODE_PREAD;
+ fd_install(fd, file);
+
+ return fd;
+}
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4166,6 +4370,9 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_RESET_DIRTY_RINGS:
r = kvm_vm_ioctl_reset_dirty_pages(kvm);
break;
+ case KVM_GET_STATS_FD:
+ r = kvm_vm_ioctl_get_stats_fd(kvm);
+ break;
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}
@@ -4245,6 +4452,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
if (r < 0)
goto put_kvm;
+ snprintf(kvm->stats_id, sizeof(kvm->stats_id),
+ "kvm-%d", task_pid_nr(current));
+
file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
if (IS_ERR(file)) {
put_unused_fd(r);
@@ -4739,7 +4949,7 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
return -ENOENT;
if (simple_attr_open(inode, file, get,
- KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
+ kvm_stats_debugfs_mode(stat_data->desc) & 0222
? set : NULL,
fmt)) {
kvm_put_kvm(stat_data->kvm);
@@ -4762,14 +4972,14 @@ static int kvm_debugfs_release(struct inode *inode, struct file *file)
static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
{
- *val = *(ulong *)((void *)kvm + offset);
+ *val = *(u64 *)((void *)(&kvm->stat) + offset);
return 0;
}
static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
{
- *(ulong *)((void *)kvm + offset) = 0;
+ *(u64 *)((void *)(&kvm->stat) + offset) = 0;
return 0;
}
@@ -4782,7 +4992,7 @@ static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
*val = 0;
kvm_for_each_vcpu(i, vcpu, kvm)
- *val += *(u64 *)((void *)vcpu + offset);
+ *val += *(u64 *)((void *)(&vcpu->stat) + offset);
return 0;
}
@@ -4793,7 +5003,7 @@ static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
- *(u64 *)((void *)vcpu + offset) = 0;
+ *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
return 0;
}
@@ -4803,14 +5013,14 @@ static int kvm_stat_data_get(void *data, u64 *val)
int r = -EFAULT;
struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
- switch (stat_data->dbgfs_item->kind) {
+ switch (stat_data->kind) {
case KVM_STAT_VM:
r = kvm_get_stat_per_vm(stat_data->kvm,
- stat_data->dbgfs_item->offset, val);
+ stat_data->desc->desc.offset, val);
break;
case KVM_STAT_VCPU:
r = kvm_get_stat_per_vcpu(stat_data->kvm,
- stat_data->dbgfs_item->offset, val);
+ stat_data->desc->desc.offset, val);
break;
}
@@ -4825,14 +5035,14 @@ static int kvm_stat_data_clear(void *data, u64 val)
if (val)
return -EINVAL;
- switch (stat_data->dbgfs_item->kind) {
+ switch (stat_data->kind) {
case KVM_STAT_VM:
r = kvm_clear_stat_per_vm(stat_data->kvm,
- stat_data->dbgfs_item->offset);
+ stat_data->desc->desc.offset);
break;
case KVM_STAT_VCPU:
r = kvm_clear_stat_per_vcpu(stat_data->kvm,
- stat_data->dbgfs_item->offset);
+ stat_data->desc->desc.offset);
break;
}
@@ -4889,6 +5099,7 @@ static int vm_stat_clear(void *_offset, u64 val)
}
DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
static int vcpu_stat_get(void *_offset, u64 *val)
{
@@ -4925,11 +5136,7 @@ static int vcpu_stat_clear(void *_offset, u64 val)
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
"%llu\n");
-
-static const struct file_operations *stat_fops[] = {
- [KVM_STAT_VCPU] = &vcpu_stat_fops,
- [KVM_STAT_VM] = &vm_stat_fops,
-};
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
@@ -4983,15 +5190,32 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
static void kvm_init_debug(void)
{
- struct kvm_stats_debugfs_item *p;
+ const struct file_operations *fops;
+ const struct _kvm_stats_desc *pdesc;
+ int i;
kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
- kvm_debugfs_num_entries = 0;
- for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
- debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
- kvm_debugfs_dir, (void *)(long)p->offset,
- stat_fops[p->kind]);
+ for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
+ pdesc = &kvm_vm_stats_desc[i];
+ if (kvm_stats_debugfs_mode(pdesc) & 0222)
+ fops = &vm_stat_fops;
+ else
+ fops = &vm_stat_readonly_fops;
+ debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+ kvm_debugfs_dir,
+ (void *)(long)pdesc->desc.offset, fops);
+ }
+
+ for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
+ pdesc = &kvm_vcpu_stats_desc[i];
+ if (kvm_stats_debugfs_mode(pdesc) & 0222)
+ fops = &vcpu_stat_fops;
+ else
+ fops = &vcpu_stat_readonly_fops;
+ debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+ kvm_debugfs_dir,
+ (void *)(long)pdesc->desc.offset, fops);
}
}
@@ -5141,7 +5365,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
SLAB_ACCOUNT,
offsetof(struct kvm_vcpu, arch),
- sizeof_field(struct kvm_vcpu, arch),
+ offsetofend(struct kvm_vcpu, stats_id)
+ - offsetof(struct kvm_vcpu, arch),
NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;