KVM: X86: Implement ring-based dirty memory tracking

This patch is heavily based on previous work from Lei Cao <lei.cao@stratus.com> and Paolo Bonzini <pbonzini@redhat.com>. [1] KVM currently uses large bitmaps to track dirty memory. These bitmaps are copied to userspace when userspace queries KVM for its dirty page information. The use of bitmaps is mostly sufficient for live migration, as large parts of memory are be dirtied from one log-dirty pass to another. However, in a checkpointing system, the number of dirty pages is small and in fact it is often bounded---the VM is paused when it has dirtied a pre-defined number of pages. Traversing a large, sparsely populated bitmap to find set bits is time-consuming, as is copying the bitmap to user-space. A similar issue will be there for live migration when the guest memory is huge while the page dirty procedure is trivial. In that case for each dirty sync we need to pull the whole dirty bitmap to userspace and analyse every bit even if it's mostly zeros. The preferred data structure for above scenarios is a dense list of guest frame numbers (GFN). This patch series stores the dirty list in kernel memory that can be memory mapped into userspace to allow speedy harvesting. This patch enables dirty ring for X86 only. However it should be easily extended to other archs as well. [1] https://patchwork.kernel.org/patch/10471409/ Signed-off-by: Lei Cao <lei.cao@stratus.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Peter Xu <peterx@redhat.com> Message-Id: <20201001012222.5767-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
author: Peter Xu <peterx@redhat.com> 2020-09-30 21:22:22 -0400
committer: Paolo Bonzini <pbonzini@redhat.com> 2020-11-15 09:49:15 -0500
commit: fb04a1eddb1a65b6588a021bdc132270d5ae48bb (patch)
tree: 0faceb84959c8f1b65f91d51c1bcc0c0a38ece43 /arch
parent: 28bd726aa404c0da8fd6852fe69bb4538a103b71 (diff)
7 files changed, 31 insertions, 2 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69e94aa716e9..f002cdb13a0b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1232,6 +1232,7 @@ struct kvm_x86_ops {
 	void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
 					   struct kvm_memory_slot *slot,
 					   gfn_t offset, unsigned long mask);
+	int (*cpu_dirty_log_size)(void);
 
 	/* pmu operations of sub-arch */
 	const struct kvm_pmu_ops *pmu_ops;
@@ -1744,4 +1745,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
 #define GET_SMSTATE(type, buf, offset)		\
 	(*(type *)((buf) + (offset) - 0x7e00))
 
+int kvm_cpu_dirty_log_size(void);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 89e5f3d1bba8..8e76d3701db3 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -12,6 +12,7 @@
 
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
 
 #define DE_VECTOR 0
 #define DB_VECTOR 1
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b804444e16d4..4bd14ab01323 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -10,7 +10,8 @@ endif
 KVM := ../../../virt/kvm
 
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+				$(KVM)/dirty_ring.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5bb1939b65d8..12e5cfe0995e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1289,6 +1289,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
+int kvm_cpu_dirty_log_size(void)
+{
+	if (kvm_x86_ops.cpu_dirty_log_size)
+		return kvm_x86_ops.cpu_dirty_log_size();
+
+	return 0;
+}
+
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 				    struct kvm_memory_slot *slot, u64 gfn)
 {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ff28a5c6abd6..cffa51c6049e 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -185,7 +185,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
 	    is_writable_pte(new_spte)) {
 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
-		mark_page_dirty_in_slot(slot, gfn);
+		mark_page_dirty_in_slot(kvm, slot, gfn);
 	}
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 46b32aa43811..2b6d538454a6 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7583,6 +7583,11 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
 	return supported & BIT(bit);
 }
 
+static int vmx_cpu_dirty_log_size(void)
+{
+	return enable_pml ? PML_ENTITY_NUM : 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.hardware_unsetup = hardware_unsetup,
 
@@ -7712,6 +7717,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.migrate_timers = vmx_migrate_timers,
 
 	.msr_filter_changed = vmx_msr_filter_changed,
+	.cpu_dirty_log_size = vmx_cpu_dirty_log_size,
 };
 
 static __init int hardware_setup(void)
@@ -7829,6 +7835,7 @@ static __init int hardware_setup(void)
 		vmx_x86_ops.slot_disable_log_dirty = NULL;
 		vmx_x86_ops.flush_log_dirty = NULL;
 		vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
+		vmx_x86_ops.cpu_dirty_log_size = NULL;
 	}
 
 	if (!cpu_has_vmx_preemption_timer())
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4ac726526f8..6c704a597b7c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8754,6 +8754,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	bool req_immediate_exit = false;
 
+	/* Forbid vmenter if vcpu dirty ring is soft-full */
+	if (unlikely(vcpu->kvm->dirty_ring_size &&
+		     kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
+		vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
+		trace_kvm_dirty_ring_exit(vcpu);
+		r = 0;
+		goto out;
+	}
+
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
 			if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
author	Peter Xu <peterx@redhat.com>	2020-09-30 21:22:22 -0400
committer	Paolo Bonzini <pbonzini@redhat.com>	2020-11-15 09:49:15 -0500
commit	fb04a1eddb1a65b6588a021bdc132270d5ae48bb (patch)
tree	0faceb84959c8f1b65f91d51c1bcc0c0a38ece43 /arch
parent	28bd726aa404c0da8fd6852fe69bb4538a103b71 (diff)