summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm64/include/asm/kvm_host.h9
-rw-r--r--arch/arm64/kvm/arm.c4
-rw-r--r--arch/arm64/kvm/guest.c5
-rw-r--r--arch/arm64/kvm/mmu.c118
-rw-r--r--arch/arm64/kvm/trace_arm.h66
-rw-r--r--arch/mips/include/asm/kvm_host.h17
-rw-r--r--arch/mips/kvm/mips.c21
-rw-r--r--arch/mips/kvm/mmu.c100
-rw-r--r--arch/mips/kvm/trap_emul.c13
-rw-r--r--arch/mips/kvm/vz.c19
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h12
-rw-r--r--arch/powerpc/include/asm/kvm_host.h7
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h9
-rw-r--r--arch/powerpc/kvm/book3s.c18
-rw-r--r--arch/powerpc/kvm/book3s.h10
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c98
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c25
-rw-r--r--arch/powerpc/kvm/book3s_hv.c12
-rw-r--r--arch/powerpc/kvm/book3s_pr.c56
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c29
-rw-r--r--arch/powerpc/kvm/trace_booke.h15
-rw-r--r--arch/s390/include/asm/kvm_host.h5
-rw-r--r--arch/s390/include/asm/smp.h1
-rw-r--r--arch/s390/kernel/smp.c1
-rw-r--r--arch/s390/kvm/diag.c31
-rw-r--r--arch/s390/kvm/gaccess.c30
-rw-r--r--arch/s390/kvm/gaccess.h60
-rw-r--r--arch/s390/kvm/kvm-s390.c15
-rw-r--r--arch/s390/kvm/kvm-s390.h8
-rw-r--r--arch/s390/kvm/vsie.c109
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h67
-rw-r--r--arch/x86/include/asm/sgx.h (renamed from arch/x86/kernel/cpu/sgx/arch.h)50
-rw-r--r--arch/x86/include/asm/svm.h4
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/include/uapi/asm/vmx.h1
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c3
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c71
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/Makefile1
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c17
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c33
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h1
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h30
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c43
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c264
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h40
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c376
-rw-r--r--arch/x86/kernel/kvm.c118
-rw-r--r--arch/x86/kvm/Kconfig12
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c89
-rw-r--r--arch/x86/kvm/cpuid.h58
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu.h23
-rw-r--r--arch/x86/kvm/mmu/mmu.c637
-rw-r--r--arch/x86/kvm/mmu/mmu_audit.c2
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h44
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/mmu/spte.c159
-rw-r--r--arch/x86/kvm/mmu/spte.h141
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c740
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h51
-rw-r--r--arch/x86/kvm/svm/avic.c24
-rw-r--r--arch/x86/kvm/svm/nested.c573
-rw-r--r--arch/x86/kvm/svm/sev.c842
-rw-r--r--arch/x86/kvm/svm/svm.c977
-rw-r--r--arch/x86/kvm/svm/svm.h77
-rw-r--r--arch/x86/kvm/svm/vmenter.S47
-rw-r--r--arch/x86/kvm/vmx/nested.c67
-rw-r--r--arch/x86/kvm/vmx/nested.h5
-rw-r--r--arch/x86/kvm/vmx/sgx.c502
-rw-r--r--arch/x86/kvm/vmx/sgx.h34
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c1
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h4
-rw-r--r--arch/x86/kvm/vmx/vmx.c403
-rw-r--r--arch/x86/kvm/vmx/vmx.h24
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h4
-rw-r--r--arch/x86/kvm/x86.c185
-rw-r--r--arch/x86/kvm/x86.h10
81 files changed, 4995 insertions, 2796 deletions
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3d10e6527f7d..3a708be3ed71 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -401,6 +401,10 @@ struct kvm_vcpu_arch {
#define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */
#define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
+ KVM_GUESTDBG_USE_SW_BP | \
+ KVM_GUESTDBG_USE_HW | \
+ KVM_GUESTDBG_SINGLESTEP)
/*
* When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can
* take the following values:
@@ -582,11 +586,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events);
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 7f06ba76698d..0d92a4e5fe80 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -208,6 +208,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VCPU_ATTRIBUTES:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ return KVM_GUESTDBG_VALID_MASK;
case KVM_CAP_ARM_SET_DEVICE_ADDR:
r = 1;
break;
@@ -1268,7 +1270,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
kvm_flush_remote_tlbs(kvm);
}
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 9bbd30e62799..6cb39ee74acd 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -888,11 +888,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
return -EINVAL;
}
-#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
- KVM_GUESTDBG_USE_SW_BP | \
- KVM_GUESTDBG_USE_HW | \
- KVM_GUESTDBG_SINGLESTEP)
-
/**
* kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
* @kvm: pointer to the KVM struct
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 8711894db8c2..35728231e9a0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -839,7 +839,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
* the page we just got a reference to gets unmapped before we have a
* chance to grab the mmu_lock, which ensure that if the page gets
- * unmapped afterwards, the call to kvm_unmap_hva will take it away
+ * unmapped afterwards, the call to kvm_unmap_gfn will take it away
* from us again properly. This smp_rmb() interacts with the smp_wmb()
* in kvm_mmu_notifier_invalidate_<page|range_end>.
*/
@@ -1064,126 +1064,70 @@ out_unlock:
return ret;
}
-static int handle_hva_to_gpa(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- int (*handler)(struct kvm *kvm,
- gpa_t gpa, u64 size,
- void *data),
- void *data)
-{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- int ret = 0;
-
- slots = kvm_memslots(kvm);
-
- /* we only care about the pages that the guest sees */
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gpa;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
-
- gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
- ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
- }
-
- return ret;
-}
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
- unsigned flags = *(unsigned *)data;
- bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
-
- __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
- return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end, unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
if (!kvm->arch.mmu.pgt)
return 0;
- trace_kvm_unmap_hva_range(start, end);
- handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
- return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
- kvm_pfn_t *pfn = (kvm_pfn_t *)data;
-
- WARN_ON(size != PAGE_SIZE);
+ __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
+ (range->end - range->start) << PAGE_SHIFT,
+ range->may_block);
- /*
- * The MMU notifiers will have unmapped a huge PMD before calling
- * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
- * therefore we never need to clear out a huge PMD through this
- * calling path and a memcache is not required.
- */
- kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
- __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
return 0;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- unsigned long end = hva + PAGE_SIZE;
- kvm_pfn_t pfn = pte_pfn(pte);
+ kvm_pfn_t pfn = pte_pfn(range->pte);
if (!kvm->arch.mmu.pgt)
return 0;
- trace_kvm_set_spte_hva(hva);
+ WARN_ON(range->end - range->start != 1);
/*
* We've moved a page around, probably through CoW, so let's treat it
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
+
+ /*
+ * The MMU notifiers will have unmapped a huge PMD before calling
+ * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
+ * therefore we never need to clear out a huge PMD through this
+ * calling path and a memcache is not required.
+ */
+ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
+ PAGE_SIZE, __pfn_to_phys(pfn),
+ KVM_PGTABLE_PROT_R, NULL);
+
return 0;
}
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- pte_t pte;
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
kvm_pte_t kpte;
+ pte_t pte;
+
+ if (!kvm->arch.mmu.pgt)
+ return 0;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+
+ kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT);
pte = __pte(kpte);
return pte_valid(pte) && pte_young(pte);
}
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
if (!kvm->arch.mmu.pgt)
return 0;
- trace_kvm_age_hva(start, end);
- return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
- if (!kvm->arch.mmu.pgt)
- return 0;
- trace_kvm_test_age_hva(hva);
- return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
- kvm_test_age_hva_handler, NULL);
+ return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT);
}
phys_addr_t kvm_mmu_get_httbr(void)
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index ff0444352bba..33e4e7dd2719 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -135,72 +135,6 @@ TRACE_EVENT(kvm_mmio_emulate,
__entry->vcpu_pc, __entry->instr, __entry->cpsr)
);
-TRACE_EVENT(kvm_unmap_hva_range,
- TP_PROTO(unsigned long start, unsigned long end),
- TP_ARGS(start, end),
-
- TP_STRUCT__entry(
- __field( unsigned long, start )
- __field( unsigned long, end )
- ),
-
- TP_fast_assign(
- __entry->start = start;
- __entry->end = end;
- ),
-
- TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
- __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
- TP_PROTO(unsigned long start, unsigned long end),
- TP_ARGS(start, end),
-
- TP_STRUCT__entry(
- __field( unsigned long, start )
- __field( unsigned long, end )
- ),
-
- TP_fast_assign(
- __entry->start = start;
- __entry->end = end;
- ),
-
- TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
- __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
-);
-
TRACE_EVENT(kvm_set_way_flush,
TP_PROTO(unsigned long vcpu_pc, bool cache),
TP_ARGS(vcpu_pc, cache),
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 3a5612e7304c..d0944a75fc8d 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -815,14 +815,7 @@ struct kvm_mips_callbacks {
int (*vcpu_init)(struct kvm_vcpu *vcpu);
void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
int (*vcpu_setup)(struct kvm_vcpu *vcpu);
- void (*flush_shadow_all)(struct kvm *kvm);
- /*
- * Must take care of flushing any cached GPA PTEs (e.g. guest entries in
- * VZ root TLB, or T&E GVA page tables and corresponding root TLB
- * mappings).
- */
- void (*flush_shadow_memslot)(struct kvm *kvm,
- const struct kvm_memory_slot *slot);
+ void (*prepare_flush_shadow)(struct kvm *kvm);
gpa_t (*gva_to_gpa)(gva_t gva);
void (*queue_timer_int)(struct kvm_vcpu *vcpu);
void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
@@ -967,11 +960,6 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
bool write);
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
/* Emulation */
int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
@@ -1154,4 +1142,7 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
+int kvm_arch_flush_remote_tlb(struct kvm *kvm);
+
#endif /* __MIPS_KVM_HOST_H__ */
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 58a8812e2fa5..4a22ba70c943 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -204,9 +204,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
/* Flush whole GPA */
kvm_mips_flush_gpa_pt(kvm, 0, ~0);
-
- /* Let implementation do the rest */
- kvm_mips_callbacks->flush_shadow_all(kvm);
+ kvm_flush_remote_tlbs(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -221,8 +219,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
/* Flush slot from GPA */
kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
slot->base_gfn + slot->npages - 1);
- /* Let implementation do the rest */
- kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
spin_unlock(&kvm->mmu_lock);
}
@@ -262,9 +259,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
/* Write protect GPA page table entries */
needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
new->base_gfn + new->npages - 1);
- /* Let implementation do the rest */
if (needs_flush)
- kvm_mips_callbacks->flush_shadow_memslot(kvm, new);
+ kvm_arch_flush_remote_tlbs_memslot(kvm, new);
spin_unlock(&kvm->mmu_lock);
}
}
@@ -996,11 +992,16 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
}
+int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+{
+ kvm_mips_callbacks->prepare_flush_shadow(kvm);
+ return 1;
+}
+
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
- /* Let implementation handle TLB/GVA invalidation */
- kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
+ kvm_flush_remote_tlbs(kvm);
}
long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 3dabeda82458..8af002bd3c07 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -439,85 +439,34 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn,
end_gfn << PAGE_SHIFT);
}
-static int handle_hva_to_gpa(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- int (*handler)(struct kvm *kvm, gfn_t gfn,
- gpa_t gfn_end,
- struct kvm_memory_slot *memslot,
- void *data),
- void *data)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- int ret = 0;
-
- slots = kvm_memslots(kvm);
-
- /* we only care about the pages that the guest sees */
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
-
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- ret |= handler(kvm, gfn, gfn_end, memslot, data);
- }
-
- return ret;
-}
-
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
-{
- kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end);
+ kvm_mips_flush_gpa_pt(kvm, range->start, range->end);
return 1;
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
-{
- handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-
- kvm_mips_callbacks->flush_shadow_all(kvm);
- return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- gpa_t gpa = gfn << PAGE_SHIFT;
- pte_t hva_pte = *(pte_t *)data;
+ gpa_t gpa = range->start << PAGE_SHIFT;
+ pte_t hva_pte = range->pte;
pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
pte_t old_pte;
if (!gpa_pte)
- return 0;
+ return false;
/* Mapping may need adjusting depending on memslot flags */
old_pte = *gpa_pte;
- if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
+ if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
hva_pte = pte_mkclean(hva_pte);
- else if (memslot->flags & KVM_MEM_READONLY)
+ else if (range->slot->flags & KVM_MEM_READONLY)
hva_pte = pte_wrprotect(hva_pte);
set_pte(gpa_pte, hva_pte);
/* Replacing an absent or old page doesn't need flushes */
if (!pte_present(old_pte) || !pte_young(old_pte))
- return 0;
+ return false;
/* Pages swapped, aged, moved, or cleaned require flushes */
return !pte_present(hva_pte) ||
@@ -526,27 +475,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
(pte_dirty(old_pte) && !pte_dirty(hva_pte));
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
- unsigned long end = hva + PAGE_SIZE;
- int ret;
-
- ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
- if (ret)
- kvm_mips_callbacks->flush_shadow_all(kvm);
- return 0;
-}
-
-static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end);
+ return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end);
}
-static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- gpa_t gpa = gfn << PAGE_SHIFT;
+ gpa_t gpa = range->start << PAGE_SHIFT;
pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
if (!gpa_pte)
@@ -554,16 +490,6 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
return pte_young(*gpa_pte);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
- return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
- return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
-}
-
/**
* _kvm_mips_map_page_fast() - Fast path GPA fault handler.
* @vcpu: VCPU pointer.
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 0788c00d7e94..5f2df497599c 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -687,16 +687,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
return 0;
}
-static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm)
+static void kvm_trap_emul_prepare_flush_shadow(struct kvm *kvm)
{
- /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */
- kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm,
- const struct kvm_memory_slot *slot)
-{
- kvm_trap_emul_flush_shadow_all(kvm);
}
static u64 kvm_trap_emul_get_one_regs[] = {
@@ -1280,8 +1272,7 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
.vcpu_init = kvm_trap_emul_vcpu_init,
.vcpu_uninit = kvm_trap_emul_vcpu_uninit,
.vcpu_setup = kvm_trap_emul_vcpu_setup,
- .flush_shadow_all = kvm_trap_emul_flush_shadow_all,
- .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot,
+ .prepare_flush_shadow = kvm_trap_emul_prepare_flush_shadow,
.gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb,
.queue_timer_int = kvm_mips_queue_timer_int_cb,
.dequeue_timer_int = kvm_mips_dequeue_timer_int_cb,
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 2ffbe9264a31..2c75571dc4a2 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -3211,32 +3211,22 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
return 0;
}
-static void kvm_vz_flush_shadow_all(struct kvm *kvm)
+static void kvm_vz_prepare_flush_shadow(struct kvm *kvm)
{
- if (cpu_has_guestid) {
- /* Flush GuestID for each VCPU individually */
- kvm_flush_remote_tlbs(kvm);
- } else {
+ if (!cpu_has_guestid) {
/*
* For each CPU there is a single GPA ASID used by all VCPUs in
* the VM, so it doesn't make sense for the VCPUs to handle
* invalidation of these ASIDs individually.
*
* Instead mark all CPUs as needing ASID invalidation in
- * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to
+ * asid_flush_mask, and kvm_flush_remote_tlbs(kvm) will
* kick any running VCPUs so they check asid_flush_mask.
*/
cpumask_setall(&kvm->arch.asid_flush_mask);
- kvm_flush_remote_tlbs(kvm);
}
}
-static void kvm_vz_flush_shadow_memslot(struct kvm *kvm,
- const struct kvm_memory_slot *slot)
-{
- kvm_vz_flush_shadow_all(kvm);
-}
-
static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu)
{
int cpu = smp_processor_id();
@@ -3292,8 +3282,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
.vcpu_init = kvm_vz_vcpu_init,
.vcpu_uninit = kvm_vz_vcpu_uninit,
.vcpu_setup = kvm_vz_vcpu_setup,
- .flush_shadow_all = kvm_vz_flush_shadow_all,
- .flush_shadow_memslot = kvm_vz_flush_shadow_memslot,
+ .prepare_flush_shadow = kvm_vz_prepare_flush_shadow,
.gva_to_gpa = kvm_vz_gva_to_gpa_cb,
.queue_timer_int = kvm_vz_queue_timer_int_cb,
.dequeue_timer_int = kvm_vz_dequeue_timer_int_cb,
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 2f5f919f6cd3..2d03f2930767 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -210,12 +210,12 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
unsigned int lpid);
extern int kvmppc_radix_init(void);
extern void kvmppc_radix_exit(void);
-extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
-extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
-extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
+extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
+extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
+extern bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map);
extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 05fb00d37609..1e83359f286b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -56,13 +56,6 @@
#define KVM_ARCH_WANT_MMU_NOTIFIER
-extern int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end,
- unsigned flags);
-extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-
#define HPTEG_CACHE_NUM (1 << 15)
#define HPTEG_HASH_BITS_PTE 13
#define HPTEG_HASH_BITS_PTE_LONG 12
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 8aacd76bb702..21ab0332eb42 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -281,11 +281,10 @@ struct kvmppc_ops {
const struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change);
- int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
- unsigned long end);
- int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end);
- int (*test_age_hva)(struct kvm *kvm, unsigned long hva);
- void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte);
+ bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
+ bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+ bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+ bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
void (*free_memslot)(struct kvm_memory_slot *slot);
int (*init_vm)(struct kvm *kvm);
void (*destroy_vm)(struct kvm *kvm);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 44bf567b6589..2b691f4d1f26 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -834,26 +834,24 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change);
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
+ return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm->arch.kvm_ops->age_hva(kvm, start, end);
+ return kvm->arch.kvm_ops->age_gfn(kvm, range);
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
+ return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
- return 0;
+ return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
}
int kvmppc_core_init_vm(struct kvm *kvm)
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 9b6323ec8e60..740e51def5a5 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -9,12 +9,10 @@
extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
struct kvm_memory_slot *memslot);
-extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
- unsigned long end);
-extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start,
- unsigned long end);
-extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva);
-extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte);
+extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bb6773594cf8..b7bd9ca040b8 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -752,51 +752,6 @@ void kvmppc_rmap_reset(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
-typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
-
-static int kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- hva_handler_fn handler)
-{
- int ret;
- int retval = 0;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
-
- slots = kvm_memslots(kvm);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn, gfn+1, ..., gfn_end-1}.
- */
- gfn = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- for (; gfn < gfn_end; ++gfn) {
- ret = handler(kvm, memslot, gfn);
- retval |= ret;
- }
- }
-
- return retval;
-}
-
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- hva_handler_fn handler)
-{
- return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
-}
-
/* Must be called with both HPTE and rmap locked */
static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
struct kvm_memory_slot *memslot,
@@ -840,8 +795,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
}
}
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
unsigned long i;
__be64 *hptep;
@@ -874,16 +829,15 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
unlock_rmap(rmapp);
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
}
- return 0;
+ return false;
}
-int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ return kvm_unmap_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
- kvm_handle_hva_range(kvm, start, end, handler);
- return 0;
+ return kvm_unmap_rmapp(kvm, range->slot, range->start);
}
void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
@@ -913,8 +867,8 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
}
}
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
@@ -968,26 +922,26 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
return ret;
}
-int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ kvm_age_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
- return kvm_handle_hva_range(kvm, start, end, handler);
+ return kvm_age_rmapp(kvm, range->slot, range->start);
}
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
unsigned long *hp;
- int ret = 1;
+ bool ret = true;
unsigned long *rmapp;
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
if (*rmapp & KVMPPC_RMAP_REFERENCED)
- return 1;
+ return true;
lock_rmap(rmapp);
if (*rmapp & KVMPPC_RMAP_REFERENCED)
@@ -1002,27 +956,27 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
goto out;
} while ((i = j) != head);
}
- ret = 0;
+ ret = false;
out:
unlock_rmap(rmapp);
return ret;
}
-int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ kvm_test_age_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
- return kvm_handle_hva(kvm, hva, handler);
+ return kvm_test_age_rmapp(kvm, range->slot, range->start);
}
-void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ return kvm_unmap_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
- kvm_handle_hva(kvm, hva, handler);
+ return kvm_unmap_rmapp(kvm, range->slot, range->start);
}
static int vcpus_running(struct kvm *kvm)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index e603de7ade52..ec4f58fa9f5a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -993,8 +993,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
}
/* Called with kvm->mmu_lock held */
-int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
@@ -1002,24 +1002,24 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
- return 0;
+ return false;
}
ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
kvm->arch.lpid);
- return 0;
+ return false;
}
/* Called with kvm->mmu_lock held */
-int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
- int ref = 0;
+ bool ref = false;
unsigned long old, *rmapp;
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
@@ -1035,26 +1035,27 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
old & PTE_RPN_MASK,
1UL << shift);
- ref = 1;
+ ref = true;
}
return ref;
}
/* Called with kvm->mmu_lock held */
-int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
+
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
- int ref = 0;
+ bool ref = false;
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
return ref;
ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep))
- ref = 1;
+ ref = true;
return ref;
}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 13bad6bf4c95..07682ad4110e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4770,7 +4770,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
kvmhv_release_all_nested(kvm);
kvmppc_rmap_reset(kvm);
kvm->arch.process_table = 0;
- /* Mutual exclusion with kvm_unmap_hva_range etc. */
+ /* Mutual exclusion with kvm_unmap_gfn_range etc. */
spin_lock(&kvm->mmu_lock);
kvm->arch.radix = 0;
spin_unlock(&kvm->mmu_lock);
@@ -4792,7 +4792,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
if (err)
return err;
kvmppc_rmap_reset(kvm);
- /* Mutual exclusion with kvm_unmap_hva_range etc. */
+ /* Mutual exclusion with kvm_unmap_gfn_range etc. */
spin_lock(&kvm->mmu_lock);
kvm->arch.radix = 1;
spin_unlock(&kvm->mmu_lock);
@@ -5654,10 +5654,10 @@ static struct kvmppc_ops kvm_ops_hv = {
.flush_memslot = kvmppc_core_flush_memslot_hv,
.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
.commit_memory_region = kvmppc_core_commit_memory_region_hv,
- .unmap_hva_range = kvm_unmap_hva_range_hv,
- .age_hva = kvm_age_hva_hv,
- .test_age_hva = kvm_test_age_hva_hv,
- .set_spte_hva = kvm_set_spte_hva_hv,
+ .unmap_gfn_range = kvm_unmap_gfn_range_hv,
+ .age_gfn = kvm_age_gfn_hv,
+ .test_age_gfn = kvm_test_age_gfn_hv,
+ .set_spte_gfn = kvm_set_spte_gfn_hv,
.free_memslot = kvmppc_core_free_memslot_hv,
.init_vm = kvmppc_core_init_vm_hv,
.destroy_vm = kvmppc_core_destroy_vm_hv,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 913944dc3620..d7733b07f489 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -425,61 +425,39 @@ static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
}
/************* MMU Notifiers *************/
-static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
long i;
struct kvm_vcpu *vcpu;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- slots = kvm_memslots(kvm);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn, gfn_end;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT,
+ range->end << PAGE_SHIFT);
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn, gfn+1, ..., gfn_end-1}.
- */
- gfn = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT,
- gfn_end << PAGE_SHIFT);
- }
+ return false;
}
-static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
- do_kvm_unmap_hva(kvm, start, end);
-
- return 0;
+ return do_kvm_unmap_gfn(kvm, range);
}
-static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva)
+static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte)
+static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* The page will get remapped properly on its next fault */
- do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
+ return do_kvm_unmap_gfn(kvm, range);
}
/*****************************************/
@@ -2079,10 +2057,10 @@ static struct kvmppc_ops kvm_ops_pr = {
.flush_memslot = kvmppc_core_flush_memslot_pr,
.prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
.commit_memory_region = kvmppc_core_commit_memory_region_pr,
- .unmap_hva_range = kvm_unmap_hva_range_pr,
- .age_hva = kvm_age_hva_pr,
- .test_age_hva = kvm_test_age_hva_pr,
- .set_spte_hva = kvm_set_spte_hva_pr,
+ .unmap_gfn_range = kvm_unmap_gfn_range_pr,
+ .age_gfn = kvm_age_gfn_pr,
+ .test_age_gfn = kvm_test_age_gfn_pr,
+ .set_spte_gfn = kvm_set_spte_gfn_pr,
.free_memslot = kvmppc_core_free_memslot_pr,
.init_vm = kvmppc_core_init_vm_pr,
.destroy_vm = kvmppc_core_destroy_vm_pr,
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index ed0c9c43d0cf..7f16afc331ef 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -721,45 +721,36 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
/************* MMU Notifiers *************/
-static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- trace_kvm_unmap_hva(hva);
-
/*
* Flush all shadow tlb entries everywhere. This is slow, but
* we are 100% sure that we catch the to be unmapped page
*/
- kvm_flush_remote_tlbs(kvm);
-
- return 0;
+ return true;
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- /* kvm_unmap_hva flushes everything anyways */
- kvm_unmap_hva(kvm, start);
-
- return 0;
+ return kvm_e500_mmu_unmap_gfn(kvm, range);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* The page will get remapped properly on its next fault */
- kvm_unmap_hva(kvm, hva);
- return 0;
+ return kvm_e500_mmu_unmap_gfn(kvm, range);
}
/*****************************************/
diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h
index 3837842986aa..eff6e82dbcd4 100644
--- a/arch/powerpc/kvm/trace_booke.h
+++ b/arch/powerpc/kvm/trace_booke.h
@@ -69,21 +69,6 @@ TRACE_EVENT(kvm_exit,
)
);
-TRACE_EVENT(kvm_unmap_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("unmap hva 0x%lx\n", __entry->hva)
-);
-
TRACE_EVENT(kvm_booke206_stlb_write,
TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 6bcfc5614bbc..8925f3969478 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -454,6 +454,7 @@ struct kvm_vcpu_stat {
u64 diagnose_44;
u64 diagnose_9c;
u64 diagnose_9c_ignored;
+ u64 diagnose_9c_forward;
u64 diagnose_258;
u64 diagnose_308;
u64 diagnose_500;
@@ -700,6 +701,10 @@ struct kvm_hw_bp_info_arch {
#define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \
(vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING))
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\
+ KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING)
+
struct kvm_guestdbg_info_arch {
unsigned long cr0;
unsigned long cr9;
diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index 01e360004481..e317fd4866c1 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -63,5 +63,6 @@ extern void __noreturn cpu_die(void);
extern void __cpu_die(unsigned int cpu);
extern int __cpu_disable(void);
extern void schedule_mcck_handler(void);
+void notrace smp_yield_cpu(int cpu);
#endif /* __ASM_SMP_H */
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 58c8afa3da65..2fec2b80d35d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -429,6 +429,7 @@ void notrace smp_yield_cpu(int cpu)
asm volatile("diag %0,0,0x9c"
: : "d" (pcpu_devices[cpu].address));
}
+EXPORT_SYMBOL_GPL(smp_yield_cpu);
/*
* Send cpus emergency shutdown signal. This gives the cpus the
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 5b8ec1c447e1..02c146f9e5cd 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -150,6 +150,19 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
return 0;
}
+static int forward_cnt;
+static unsigned long cur_slice;
+
+static int diag9c_forwarding_overrun(void)
+{
+ /* Reset the count on a new slice */
+ if (time_after(jiffies, cur_slice)) {
+ cur_slice = jiffies;
+ forward_cnt = diag9c_forwarding_hz / HZ;
+ }
+ return forward_cnt-- <= 0 ? 1 : 0;
+}
+
static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *tcpu;
@@ -167,9 +180,21 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
if (!tcpu)
goto no_yield;
- /* target already running */
- if (READ_ONCE(tcpu->cpu) >= 0)
- goto no_yield;
+ /* target guest VCPU already running */
+ if (READ_ONCE(tcpu->cpu) >= 0) {
+ if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
+ goto no_yield;
+
+ /* target host CPU already running */
+ if (!vcpu_is_preempted(tcpu->cpu))
+ goto no_yield;
+ smp_yield_cpu(tcpu->cpu);
+ VCPU_EVENT(vcpu, 5,
+ "diag time slice end directed to %d: yield forwarded",
+ tid);
+ vcpu->stat.diagnose_9c_forward++;
+ return 0;
+ }
if (kvm_vcpu_yield_to(tcpu) <= 0)
goto no_yield;
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 6d6b57059493..b9f85b2dc053 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -976,7 +976,9 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
* kvm_s390_shadow_tables - walk the guest page table and create shadow tables
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the page table address result
+ * @pgt: pointer to the beginning of the page table for the given address if
+ * successful (return value 0), or to the first invalid DAT entry in
+ * case of exceptions (return value > 0)
* @fake: pgt references contiguous guest memory block, not a pgtable
*/
static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
@@ -1034,6 +1036,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
rfte.val = ptr;
goto shadow_r2t;
}
+ *pgt = ptr + vaddr.rfx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
if (rc)
return rc;
@@ -1060,6 +1063,7 @@ shadow_r2t:
rste.val = ptr;
goto shadow_r3t;
}
+ *pgt = ptr + vaddr.rsx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
if (rc)
return rc;
@@ -1087,6 +1091,7 @@ shadow_r3t:
rtte.val = ptr;
goto shadow_sgt;
}
+ *pgt = ptr + vaddr.rtx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
if (rc)
return rc;
@@ -1123,6 +1128,7 @@ shadow_sgt:
ste.val = ptr;
goto shadow_pgt;
}
+ *pgt = ptr + vaddr.sx * 8;
rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
if (rc)
return rc;
@@ -1157,6 +1163,8 @@ shadow_pgt:
* @vcpu: virtual cpu
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
+ * @datptr: will contain the address of the faulting DAT table entry, or of
+ * the valid leaf, plus some flags
*
* Returns: - 0 if the shadow fault was successfully resolved
* - > 0 (pgm exception code) on exceptions while faulting
@@ -1165,11 +1173,11 @@ shadow_pgt:
* - -ENOMEM if out of memory
*/
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
- unsigned long saddr)
+ unsigned long saddr, unsigned long *datptr)
{
union vaddress vaddr;
union page_table_entry pte;
- unsigned long pgt;
+ unsigned long pgt = 0;
int dat_protection, fake;
int rc;
@@ -1191,8 +1199,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
pte.val = pgt + vaddr.px * PAGE_SIZE;
goto shadow_page;
}
- if (!rc)
- rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+
+ switch (rc) {
+ case PGM_SEGMENT_TRANSLATION:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_FIRST_TRANS:
+ pgt |= PEI_NOT_PTE;
+ break;
+ case 0:
+ pgt += vaddr.px * 8;
+ rc = gmap_read_table(sg->parent, pgt, &pte.val);
+ }
+ if (datptr)
+ *datptr = pgt | dat_protection * PEI_DAT_PROT;
if (!rc && pte.i)
rc = PGM_PAGE_TRANSLATION;
if (!rc && pte.z)
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index f4c51756c462..7c72a5e3449f 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -18,17 +18,14 @@
/**
* kvm_s390_real_to_abs - convert guest real address to guest absolute address
- * @vcpu - guest virtual cpu
+ * @prefix - guest prefix
* @gra - guest real address
*
* Returns the guest absolute address that corresponds to the passed guest real
- * address @gra of a virtual guest cpu by applying its prefix.
+ * address @gra of by applying the given prefix.
*/
-static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
- unsigned long gra)
+static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra)
{
- unsigned long prefix = kvm_s390_get_prefix(vcpu);
-
if (gra < 2 * PAGE_SIZE)
gra += prefix;
else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
@@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
}
/**
+ * kvm_s390_real_to_abs - convert guest real address to guest absolute address
+ * @vcpu - guest virtual cpu
+ * @gra - guest real address
+ *
+ * Returns the guest absolute address that corresponds to the passed guest real
+ * address @gra of a virtual guest cpu by applying its prefix.
+ */
+static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
+ unsigned long gra)
+{
+ return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra);
+}
+
+/**
+ * _kvm_s390_logical_to_effective - convert guest logical to effective address
+ * @psw: psw of the guest
+ * @ga: guest logical address
+ *
+ * Convert a guest logical address to an effective address by applying the
+ * rules of the addressing mode defined by bits 31 and 32 of the given PSW
+ * (extendended/basic addressing mode).
+ *
+ * Depending on the addressing mode, the upper 40 bits (24 bit addressing
+ * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing
+ * mode) of @ga will be zeroed and the remaining bits will be returned.
+ */
+static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw,
+ unsigned long ga)
+{
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
+ return ga;
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
+ return ga & ((1UL << 31) - 1);
+ return ga & ((1UL << 24) - 1);
+}
+
+/**
* kvm_s390_logical_to_effective - convert guest logical to effective address
* @vcpu: guest virtual cpu
* @ga: guest logical address
@@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
unsigned long ga)
{
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
-
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
- return ga;
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
- return ga & ((1UL << 31) - 1);
- return ga & ((1UL << 24) - 1);
+ return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga);
}
/*
@@ -359,7 +387,11 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
int ipte_lock_held(struct kvm_vcpu *vcpu);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+/* MVPG PEI indication bits */
+#define PEI_DAT_PROT 2
+#define PEI_NOT_PTE 4
+
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
- unsigned long saddr);
+ unsigned long saddr, unsigned long *datptr);
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2f09e9d7dc95..1296fc10f80c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -158,6 +158,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("instruction_diag_44", diagnose_44),
VCPU_STAT("instruction_diag_9c", diagnose_9c),
VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
+ VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
VCPU_STAT("instruction_diag_258", diagnose_258),
VCPU_STAT("instruction_diag_308", diagnose_308),
VCPU_STAT("instruction_diag_500", diagnose_500),
@@ -185,6 +186,11 @@ static bool use_gisa = true;
module_param(use_gisa, bool, 0644);
MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
+/* maximum diag9c forwarding per second */
+unsigned int diag9c_forwarding_hz;
+module_param(diag9c_forwarding_hz, uint, 0644);
+MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
+
/*
* For now we handle at most 16 double words as this is what the s390 base
* kernel handles and stores in the prefix page. If we ever need to go beyond
@@ -544,6 +550,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_DIAG318:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ r = KVM_GUESTDBG_VALID_MASK;
+ break;
case KVM_CAP_S390_HPAGE_1M:
r = 0;
if (hpage && !kvm_is_ucontrol(kvm))
@@ -4307,16 +4316,16 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu)
kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val;
if (MACHINE_HAS_GS) {
+ preempt_disable();
__ctl_set_bit(2, 4);
if (vcpu->arch.gs_enabled)
save_gs_cb(current->thread.gs_cb);
- preempt_disable();
current->thread.gs_cb = vcpu->arch.host_gscb;
restore_gs_cb(vcpu->arch.host_gscb);
- preempt_enable();
if (!vcpu->arch.host_gscb)
__ctl_clear_bit(2, 4);
vcpu->arch.host_gscb = NULL;
+ preempt_enable();
}
/* SIE will save etoken directly into SDNX and therefore kvm_run */
}
@@ -4542,7 +4551,7 @@ int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
/*
* As we are starting a second VCPU, we have to disable
* the IBS facility on all VCPUs to remove potentially
- * oustanding ENABLE requests.
+ * outstanding ENABLE requests.
*/
__disable_ibs_on_all_vcpus(vcpu->kvm);
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 79dcd647b378..9fad25109b0d 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -471,4 +471,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
* @kvm: the KVM guest
*/
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
+
+/**
+ * diag9c_forwarding_hz
+ *
+ * Set the maximum number of diag9c forwarding per second
+ */
+extern unsigned int diag9c_forwarding_hz;
+
#endif
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index bd803e091918..4002a24bc43a 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -417,11 +417,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
memcpy((void *)((u64)scb_o + 0xc0),
(void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
break;
- case ICPT_PARTEXEC:
- /* MVPG only */
- memcpy((void *)((u64)scb_o + 0xc0),
- (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
- break;
}
if (scb_s->ihcpu != 0xffffU)
@@ -620,10 +615,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
if (!rc && (scb_s->ecb & ECB_TE))
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- prefix + PAGE_SIZE);
+ prefix + PAGE_SIZE, NULL);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
@@ -914,7 +909,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
current->thread.gmap_addr, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_addr);
+ current->thread.gmap_addr, NULL);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
current->thread.gmap_addr,
@@ -936,7 +931,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
{
if (vsie_page->fault_addr)
kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- vsie_page->fault_addr);
+ vsie_page->fault_addr, NULL);
vsie_page->fault_addr = 0;
}
@@ -984,6 +979,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/*
+ * Get a register for a nested guest.
+ * @vcpu the vcpu of the guest
+ * @vsie_page the vsie_page for the nested guest
+ * @reg the register number, the upper 4 bits are ignored.
+ * returns: the value of the register.
+ */
+static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
+{
+ /* no need to validate the parameter and/or perform error handling */
+ reg &= 0xf;
+ switch (reg) {
+ case 15:
+ return vsie_page->scb_s.gg15;
+ case 14:
+ return vsie_page->scb_s.gg14;
+ default:
+ return vcpu->run->s.regs.gprs[reg];
+ }
+}
+
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+ u64 *pei_block = &vsie_page->scb_o->mcic;
+ int edat, rc_dest, rc_src;
+ union ctlreg0 cr0;
+
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+ mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
+ prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+
+ dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
+ dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
+ src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
+ src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
+
+ rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
+ rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+ /*
+ * Either everything went well, or something non-critical went wrong
+ * e.g. because of a race. In either case, simply retry.
+ */
+ if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
+ retry_vsie_icpt(vsie_page);
+ return -EAGAIN;
+ }
+ /* Something more serious went wrong, propagate the error */
+ if (rc_dest < 0)
+ return rc_dest;
+ if (rc_src < 0)
+ return rc_src;
+
+ /* The only possible suppressing exception: just deliver it */
+ if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
+ clear_vsie_icpt(vsie_page);
+ rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
+ WARN_ON_ONCE(rc_dest);
+ return 1;
+ }
+
+ /*
+ * Forward the PEI intercept to the guest if it was a page fault, or
+ * also for segment and region table faults if EDAT applies.
+ */
+ if (edat) {
+ rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
+ rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
+ } else {
+ rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
+ rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
+ }
+ if (!rc_dest && !rc_src) {
+ pei_block[0] = pei_dest;
+ pei_block[1] = pei_src;
+ return 1;
+ }
+
+ retry_vsie_icpt(vsie_page);
+
+ /*
+ * The host has edat, and the guest does not, or it was an ASCE type
+ * exception. The host needs to inject the appropriate DAT interrupts
+ * into the guest.
+ */
+ if (rc_dest)
+ return inject_fault(vcpu, rc_dest, dest, 1);
+ return inject_fault(vcpu, rc_src, src, 0);
+}
+
+/*
* Run the vsie on a shadow scb and a shadow gmap, without any further
* sanity checks, handling SIE faults.
*
@@ -1071,6 +1158,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if ((scb_s->ipa & 0xf000) != 0xf000)
scb_s->ipa += 0x1000;
break;
+ case ICPT_PARTEXEC:
+ if (scb_s->ipa == 0xb254)
+ rc = vsie_handle_mvpg(vcpu, vsie_page);
+ break;
}
return rc;
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..35391e94bd22 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1931,6 +1931,7 @@ config X86_SGX
depends on CRYPTO_SHA256=y
select SRCU
select MMU_NOTIFIER
+ select NUMA_KEEP_MEMINFO if NUMA
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cc96e26d69f7..dddc746b5455 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -290,6 +290,8 @@
#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
@@ -336,6 +338,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3768819693e5..ad22d4839bcc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -221,12 +221,22 @@ enum x86_intercept_stage;
#define DR7_FIXED_1 0x00000400
#define DR7_VOLATILE 0xffff2bff
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | \
+ KVM_GUESTDBG_SINGLESTEP | \
+ KVM_GUESTDBG_USE_HW_BP | \
+ KVM_GUESTDBG_USE_SW_BP | \
+ KVM_GUESTDBG_INJECT_BP | \
+ KVM_GUESTDBG_INJECT_DB)
+
+
#define PFERR_PRESENT_BIT 0
#define PFERR_WRITE_BIT 1
#define PFERR_USER_BIT 2
#define PFERR_RSVD_BIT 3
#define PFERR_FETCH_BIT 4
#define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
#define PFERR_GUEST_FINAL_BIT 32
#define PFERR_GUEST_PAGE_BIT 33
@@ -236,6 +246,7 @@ enum x86_intercept_stage;
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
@@ -1054,6 +1065,9 @@ struct kvm_arch {
u32 user_space_msr_mask;
struct kvm_x86_msr_filter __rcu *msr_filter;
+ /* Guest can access the SGX PROVISIONKEY. */
+ bool sgx_provisioning_allowed;
+
struct kvm_pmu_event_filter __rcu *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
@@ -1068,25 +1082,36 @@ struct kvm_arch {
bool tdp_mmu_enabled;
/*
- * List of struct kvmp_mmu_pages being used as roots.
+ * List of struct kvm_mmu_pages being used as roots.
* All struct kvm_mmu_pages in the list should have
* tdp_mmu_page set.
- * All struct kvm_mmu_pages in the list should have a positive
- * root_count except when a thread holds the MMU lock and is removing
- * an entry from the list.
+ *
+ * For reads, this list is protected by:
+ * the MMU lock in read mode + RCU or
+ * the MMU lock in write mode
+ *
+ * For writes, this list is protected by:
+ * the MMU lock in read mode + the tdp_mmu_pages_lock or
+ * the MMU lock in write mode
+ *
+ * Roots will remain in the list until their tdp_mmu_root_count
+ * drops to zero, at which point the thread that decremented the
+ * count to zero should removed the root from the list and clean
+ * it up, freeing the root after an RCU grace period.
*/
struct list_head tdp_mmu_roots;
/*
* List of struct kvmp_mmu_pages not being used as roots.
* All struct kvm_mmu_pages in the list should have
- * tdp_mmu_page set and a root_count of 0.
+ * tdp_mmu_page set and a tdp_mmu_root_count of 0.
*/
struct list_head tdp_mmu_pages;
/*
* Protects accesses to the following fields when the MMU lock
* is held in read mode:
+ * - tdp_mmu_roots (above)
* - tdp_mmu_pages (above)
* - the link field of struct kvm_mmu_pages used by the TDP MMU
* - lpage_disallowed_mmu_pages
@@ -1143,6 +1168,9 @@ struct kvm_vcpu_stat {
u64 req_event;
u64 halt_poll_success_ns;
u64 halt_poll_fail_ns;
+ u64 nested_run;
+ u64 directed_yield_attempted;
+ u64 directed_yield_successful;
};
struct x86_instruction_info;
@@ -1269,8 +1297,8 @@ struct kvm_x86_ops {
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
- void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
- int pgd_level);
+ void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level);
bool (*has_wbinvd_exit)(void);
@@ -1339,6 +1367,7 @@ struct kvm_x86_ops {
int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
@@ -1357,6 +1386,7 @@ struct kvm_x86_ops {
struct kvm_x86_nested_ops {
int (*check_events)(struct kvm_vcpu *vcpu);
bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+ void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
unsigned user_data_size);
@@ -1428,9 +1458,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask, u64 me_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1440,8 +1467,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1538,6 +1563,11 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -1566,14 +1596,14 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -1614,9 +1644,6 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free);
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -1735,11 +1762,7 @@ asmlinkage void kvm_spurious_fault(void);
_ASM_EXTABLE(666b, 667b)
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_cpu_has_extint(struct kvm_vcpu *v);
diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx.h
index dd7602c44c72..a16e2c9154a3 100644
--- a/arch/x86/kernel/cpu/sgx/arch.h
+++ b/arch/x86/include/asm/sgx.h
@@ -2,15 +2,20 @@
/**
* Copyright(c) 2016-20 Intel Corporation.
*
- * Contains data structures defined by the SGX architecture. Data structures
- * defined by the Linux software stack should not be placed here.
+ * Intel Software Guard Extensions (SGX) support.
*/
-#ifndef _ASM_X86_SGX_ARCH_H
-#define _ASM_X86_SGX_ARCH_H
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
#include <linux/bits.h>
#include <linux/types.h>
+/*
+ * This file contains both data structures defined by SGX architecture and Linux
+ * defined software data structures and functions. The two should not be mixed
+ * together for better readibility. The architectural definitions come first.
+ */
+
/* The SGX specific CPUID function. */
#define SGX_CPUID 0x12
/* EPC enumeration. */
@@ -22,16 +27,36 @@
/* The bitmask for the EPC section type. */
#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
+enum sgx_encls_function {
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+ EAUG = 0x0D,
+ EMODPR = 0x0E,
+ EMODT = 0x0F,
+};
+
/**
* enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
* %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
* been completed yet.
+ * %SGX_CHILD_PRESENT SECS has child pages present in the EPC.
* %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
* public key does not match IA32_SGXLEPUBKEYHASH.
* %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
*/
enum sgx_return_code {
SGX_NOT_TRACKED = 11,
+ SGX_CHILD_PRESENT = 13,
SGX_INVALID_EINITTOKEN = 16,
SGX_UNMASKED_EVENT = 128,
};
@@ -335,4 +360,19 @@ struct sgx_sigstruct {
#define SGX_LAUNCH_TOKEN_SIZE 304
-#endif /* _ASM_X86_SGX_ARCH_H */
+/*
+ * Do not put any hardware-defined SGX structure representations below this
+ * comment!
+ */
+
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd);
+
+#endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1c561945b426..772e60efe243 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -269,7 +269,9 @@ struct vmcb_save_area {
* SEV-ES guests when referenced through the GHCB or for
* saving to the host save area.
*/
- u8 reserved_7[80];
+ u8 reserved_7[72];
+ u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
+ u8 reserved_7b[4];
u32 pkru;
u8 reserved_7a[20];
u64 reserved_8; /* rax already available at 0x01f8 */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 358707f60d99..0ffaa3156a4e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -373,6 +373,7 @@ enum vmcs_field {
#define GUEST_INTR_STATE_MOV_SS 0x00000002
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR 0x00000010
/* GUEST_ACTIVITY_STATE flags */
#define GUEST_ACTIVITY_ACTIVE 0
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b8e650a985e3..946d761adbd3 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -27,6 +27,7 @@
#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000
#define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 42af31b64c2c..defda61f372d 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
{ X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
+ { X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX1, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
{}
};
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 3b1b01f2b248..da696eb4821a 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
}
#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
-static void clear_sgx_caps(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_SGX);
- setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
static int __init nosgx(char *str)
{
- clear_sgx_caps();
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
return 0;
}
@@ -110,23 +104,30 @@ early_param("nosgx", nosgx);
void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
{
+ bool enable_sgx_kvm = false, enable_sgx_driver = false;
bool tboot = tboot_enabled();
- bool enable_sgx;
+ bool enable_vmx;
u64 msr;
if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
- clear_sgx_caps();
+ clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
- /*
- * Enable SGX if and only if the kernel supports SGX and Launch Control
- * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
- */
- enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
- cpu_has(c, X86_FEATURE_SGX_LC) &&
- IS_ENABLED(CONFIG_X86_SGX);
+ enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+ IS_ENABLED(CONFIG_KVM_INTEL);
+
+ if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+ /*
+ * Separate out SGX driver enabling from KVM. This allows KVM
+ * guests to use SGX even if the kernel SGX driver refuses to
+ * use it. This happens if flexible Launch Control is not
+ * available.
+ */
+ enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+ enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+ }
if (msr & FEAT_CTL_LOCKED)
goto update_caps;
@@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
* i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
* for the kernel, e.g. using VMX to hide malicious code.
*/
- if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ if (enable_vmx) {
msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
if (tboot)
msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
}
- if (enable_sgx)
- msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+ if (enable_sgx_kvm || enable_sgx_driver) {
+ msr |= FEAT_CTL_SGX_ENABLED;
+ if (enable_sgx_driver)
+ msr |= FEAT_CTL_SGX_LC_ENABLED;
+ }
wrmsrl(MSR_IA32_FEAT_CTL, msr);
@@ -173,10 +177,29 @@ update_caps:
}
update_sgx:
- if (!(msr & FEAT_CTL_SGX_ENABLED) ||
- !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
- if (enable_sgx)
- pr_err_once("SGX disabled by BIOS\n");
- clear_sgx_caps();
+ if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+ if (enable_sgx_kvm || enable_sgx_driver)
+ pr_err_once("SGX disabled by BIOS.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ /*
+ * VMX feature bit may be cleared due to being disabled in BIOS,
+ * in which case SGX virtualization cannot be supported either.
+ */
+ if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+ pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+ enable_sgx_kvm = 0;
+ }
+
+ if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+ if (!enable_sgx_kvm) {
+ pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ } else {
+ pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+ }
}
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 972ec3bfa9c0..21d1f062895a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 91d3dc784a29..9c1656779b2a 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -3,3 +3,4 @@ obj-y += \
encl.o \
ioctl.o \
main.o
+obj-$(CONFIG_X86_SGX_KVM) += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 8ce6d8371cfb..aa9b8b868867 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = {
.get_unmapped_area = sgx_get_unmapped_area,
};
-const struct file_operations sgx_provision_fops = {
- .owner = THIS_MODULE,
-};
-
static struct miscdevice sgx_dev_enclave = {
.minor = MISC_DYNAMIC_MINOR,
.name = "sgx_enclave",
@@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = {
.fops = &sgx_encl_fops,
};
-static struct miscdevice sgx_dev_provision = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "sgx_provision",
- .nodename = "sgx_provision",
- .fops = &sgx_provision_fops,
-};
-
int __init sgx_drv_init(void)
{
unsigned int eax, ebx, ecx, edx;
@@ -187,11 +176,5 @@ int __init sgx_drv_init(void)
if (ret)
return ret;
- ret = misc_register(&sgx_dev_provision);
- if (ret) {
- misc_deregister(&sgx_dev_enclave);
- return ret;
- }
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 7449ef33f081..3be203297988 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -7,7 +7,7 @@
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"
@@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
if (ret) {
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(ret);
}
@@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref)
if (sgx_unmark_page_reclaimable(entry->epc_page))
continue;
- sgx_free_epc_page(entry->epc_page);
+ sgx_encl_free_epc_page(entry->epc_page);
encl->secs_child_cnt--;
entry->epc_page = NULL;
}
@@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref)
xa_destroy(&encl->page_array);
if (!encl->secs_child_cnt && encl->secs.epc_page) {
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
}
@@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref)
va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
list);
list_del(&va_page->list);
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
kfree(va_page);
}
@@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void)
ret = __epa(sgx_get_epc_virt_addr(epc_page));
if (ret) {
WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(-EFAULT);
}
@@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page)
return slot == SGX_VA_SLOT_COUNT;
}
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page: EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list. Otherwise, it
+ * gives a WARNING to indicate page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+ return;
+
+ sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index d8d30ccbef4c..6e74f85b6264 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void);
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 443188fe7e70..9b204843b78d 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -11,21 +11,6 @@
#include <asm/traps.h>
#include "sgx.h"
-enum sgx_encls_function {
- ECREATE = 0x00,
- EADD = 0x01,
- EINIT = 0x02,
- EREMOVE = 0x03,
- EDGBRD = 0x04,
- EDGBWR = 0x05,
- EEXTEND = 0x06,
- ELDU = 0x08,
- EBLOCK = 0x09,
- EPA = 0x0A,
- EWB = 0x0B,
- ETRACK = 0x0C,
-};
-
/**
* ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
*
@@ -55,6 +40,19 @@ enum sgx_encls_function {
} while (0); \
}
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret: the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true: ENCLS leaf faulted.
+ * - false: Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+ return ret & ENCLS_FAULT_FLAG;
+}
+
/**
* encls_failed() - Check if an ENCLS function failed
* @ret: the return value of an ENCLS function call
@@ -65,7 +63,7 @@ enum sgx_encls_function {
*/
static inline bool encls_failed(int ret)
{
- if (ret & ENCLS_FAULT_FLAG)
+ if (encls_faulted(ret))
return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
return !!ret;
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 90a5caf76939..83df20e3e633 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -2,6 +2,7 @@
/* Copyright(c) 2016-20 Intel Corporation. */
#include <asm/mman.h>
+#include <asm/sgx.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
encl->page_cnt--;
if (va_page) {
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
list_del(&va_page->list);
kfree(va_page);
}
@@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
return 0;
err_out:
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
err_out_backing:
@@ -365,7 +366,7 @@ err_out_unlock:
mmap_read_unlock(current->mm);
err_out_free:
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
kfree(encl_page);
return ret;
@@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
void *token)
{
u64 mrsigner[4];
- int i, j, k;
+ int i, j;
void *addr;
int ret;
@@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
preempt_disable();
- for (k = 0; k < 4; k++)
- wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+ sgx_update_lepubkeyhash(mrsigner);
ret = __einit(sigstruct, token, addr);
@@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
}
}
- if (ret & ENCLS_FAULT_FLAG) {
+ if (encls_faulted(ret)) {
if (encls_failed(ret))
ENCLS_WARN(ret, "EINIT");
@@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
{
struct sgx_sigstruct *sigstruct;
struct sgx_enclave_init init_arg;
- struct page *initp_page;
void *token;
int ret;
@@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
return -EFAULT;
- initp_page = alloc_page(GFP_KERNEL);
- if (!initp_page)
+ /*
+ * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+ * boundary. kmalloc() will give this alignment when allocating
+ * PAGE_SIZE bytes.
+ */
+ sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sigstruct)
return -ENOMEM;
- sigstruct = kmap(initp_page);
token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
@@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
ret = sgx_encl_init(encl, sigstruct, token);
out:
- kunmap(initp_page);
- __free_page(initp_page);
+ kfree(sigstruct);
return ret;
}
@@ -665,24 +667,11 @@ out:
static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
{
struct sgx_enclave_provision params;
- struct file *file;
if (copy_from_user(&params, arg, sizeof(params)))
return -EFAULT;
- file = fget(params.fd);
- if (!file)
- return -EINVAL;
-
- if (file->f_op != &sgx_provision_fops) {
- fput(file);
- return -EINVAL;
- }
-
- encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
- fput(file);
- return 0;
+ return sgx_set_attribute(&encl->attributes_mask, params.fd);
}
long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8df81a3ed945..ad904747419e 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1,14 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */
+#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"
@@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
* with sgx_reclaimer_lock acquired.
*/
static LIST_HEAD(sgx_active_page_list);
-
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+/* The free page list lock protected variables prepend the lock. */
+static unsigned long sgx_nr_free_pages;
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node. Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
/*
- * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
- * pages whose child pages blocked EREMOVE.
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * prepending their children in the input list are left intact.
*/
-static void sgx_sanitize_section(struct sgx_epc_section *section)
+static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
struct sgx_epc_page *page;
LIST_HEAD(dirty);
int ret;
- /* init_laundry_list is thread-local, no need for a lock: */
- while (!list_empty(&section->init_laundry_list)) {
+ /* dirty_page_list is thread-local, no need for a lock: */
+ while (!list_empty(dirty_page_list)) {
if (kthread_should_stop())
return;
- /* needed for access to ->page_list: */
- spin_lock(&section->lock);
-
- page = list_first_entry(&section->init_laundry_list,
- struct sgx_epc_page, list);
+ page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
ret = __eremove(sgx_get_epc_virt_addr(page));
- if (!ret)
- list_move(&page->list, &section->page_list);
- else
+ if (!ret) {
+ /*
+ * page is now sanitized. Make it available via the SGX
+ * page allocator:
+ */
+ list_del(&page->list);
+ sgx_free_epc_page(page);
+ } else {
+ /* The page is not yet clean - move to the dirty list. */
list_move_tail(&page->list, &dirty);
-
- spin_unlock(&section->lock);
+ }
cond_resched();
}
- list_splice(&dirty, &section->init_laundry_list);
+ list_splice(&dirty, dirty_page_list);
}
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
sgx_encl_put_backing(&secs_backing, true);
@@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void)
struct sgx_epc_section *section;
struct sgx_encl_page *encl_page;
struct sgx_epc_page *epc_page;
+ struct sgx_numa_node *node;
pgoff_t page_index;
int cnt = 0;
int ret;
@@ -379,50 +399,33 @@ skip:
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
section = &sgx_epc_sections[epc_page->section];
- spin_lock(&section->lock);
- list_add_tail(&epc_page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
- }
-}
-
-static unsigned long sgx_nr_free_pages(void)
-{
- unsigned long cnt = 0;
- int i;
-
- for (i = 0; i < sgx_nr_epc_sections; i++)
- cnt += sgx_epc_sections[i].free_cnt;
+ node = section->node;
- return cnt;
+ spin_lock(&node->lock);
+ list_add_tail(&epc_page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
+ spin_unlock(&node->lock);
+ }
}
static bool sgx_should_reclaim(unsigned long watermark)
{
- return sgx_nr_free_pages() < watermark &&
- !list_empty(&sgx_active_page_list);
+ return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
}
static int ksgxd(void *p)
{
- int i;
-
set_freezable();
/*
* Sanitize pages in order to recover from kexec(). The 2nd pass is
* required for SECS pages, whose child pages blocked EREMOVE.
*/
- for (i = 0; i < sgx_nr_epc_sections; i++)
- sgx_sanitize_section(&sgx_epc_sections[i]);
-
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- sgx_sanitize_section(&sgx_epc_sections[i]);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
- /* Should never happen. */
- if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
- WARN(1, "EPC section %d has unsanitized pages.\n", i);
- }
+ /* sanity check: */
+ WARN_ON(!list_empty(&sgx_dirty_page_list));
while (!kthread_should_stop()) {
if (try_to_freeze())
@@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void)
return true;
}
-static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
- struct sgx_epc_page *page;
+ struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+ struct sgx_epc_page *page = NULL;
- spin_lock(&section->lock);
+ spin_lock(&node->lock);
- if (list_empty(&section->page_list)) {
- spin_unlock(&section->lock);
+ if (list_empty(&node->free_page_list)) {
+ spin_unlock(&node->lock);
return NULL;
}
- page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
+ page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
- section->free_cnt--;
+ sgx_nr_free_pages--;
+
+ spin_unlock(&node->lock);
- spin_unlock(&section->lock);
return page;
}
/**
* __sgx_alloc_epc_page() - Allocate an EPC page
*
- * Iterate through EPC sections and borrow a free EPC page to the caller. When a
- * page is no longer needed it must be released with sgx_free_epc_page().
+ * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start
+ * from the NUMA node, where the caller is executing.
*
* Return:
- * an EPC page,
- * -errno on error
+ * - an EPC page: A borrowed EPC pages were available.
+ * - NULL: Out of EPC pages.
*/
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
- struct sgx_epc_section *section;
struct sgx_epc_page *page;
- int i;
+ int nid_of_current = numa_node_id();
+ int nid = nid_of_current;
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- section = &sgx_epc_sections[i];
+ if (node_isset(nid_of_current, sgx_numa_mask)) {
+ page = __sgx_alloc_epc_page_from_node(nid_of_current);
+ if (page)
+ return page;
+ }
+
+ /* Fall back to the non-local NUMA nodes: */
+ while (true) {
+ nid = next_node_in(nid, sgx_numa_mask);
+ if (nid == nid_of_current)
+ break;
- page = __sgx_alloc_epc_page_from_section(section);
+ page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
}
@@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
* sgx_free_epc_page() - Free an EPC page
* @page: an EPC page
*
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
*/
void sgx_free_epc_page(struct sgx_epc_page *page)
{
struct sgx_epc_section *section = &sgx_epc_sections[page->section];
- int ret;
+ struct sgx_numa_node *node = section->node;
- WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+ spin_lock(&node->lock);
- ret = __eremove(sgx_get_epc_virt_addr(page));
- if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
- return;
+ list_add_tail(&page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
- spin_lock(&section->lock);
- list_add_tail(&page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
+ spin_unlock(&node->lock);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
@@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
- spin_lock_init(&section->lock);
- INIT_LIST_HEAD(&section->page_list);
- INIT_LIST_HEAD(&section->init_laundry_list);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
- list_add_tail(&section->pages[i].list, &section->init_laundry_list);
+ list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
- section->free_cnt = nr_pages;
return true;
}
@@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void)
{
u32 eax, ebx, ecx, edx, type;
u64 pa, size;
+ int nid;
int i;
+ sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+ if (!sgx_numa_nodes)
+ return false;
+
for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
@@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void)
break;
}
+ nid = numa_map_to_online_node(phys_to_target_node(pa));
+ if (nid == NUMA_NO_NODE) {
+ /* The physical address is already printed above. */
+ pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+ nid = 0;
+ }
+
+ if (!node_isset(nid, sgx_numa_mask)) {
+ spin_lock_init(&sgx_numa_nodes[nid].lock);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ node_set(nid, sgx_numa_mask);
+ }
+
+ sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+
sgx_nr_epc_sections++;
}
@@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void)
return true;
}
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+ int i;
+
+ WARN_ON_ONCE(preemptible());
+
+ for (i = 0; i < 4; i++)
+ wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes: Pointer to allowed enclave attributes
+ * @attribute_fd: File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL: Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd)
+{
+ struct file *file;
+
+ file = fget(attribute_fd);
+ if (!file)
+ return -EINVAL;
+
+ if (file->f_op != &sgx_provision_fops) {
+ fput(file);
+ return -EINVAL;
+ }
+
+ *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+ fput(file);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
static int __init sgx_init(void)
{
int ret;
@@ -716,12 +806,28 @@ static int __init sgx_init(void)
goto err_page_cache;
}
- ret = sgx_drv_init();
+ ret = misc_register(&sgx_dev_provision);
if (ret)
goto err_kthread;
+ /*
+ * Always try to initialize the native *and* KVM drivers.
+ * The KVM driver is less picky than the native one and
+ * can function if the native one is not supported on the
+ * current system or fails to initialize.
+ *
+ * Error out only if both fail to initialize.
+ */
+ ret = sgx_drv_init();
+
+ if (sgx_vepc_init() && ret)
+ goto err_provision;
+
return 0;
+err_provision:
+ misc_deregister(&sgx_dev_provision);
+
err_kthread:
kthread_stop(ksgxd_tsk);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 5fa42d143feb..4628acec0009 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -8,11 +8,15 @@
#include <linux/rwsem.h>
#include <linux/types.h>
#include <asm/asm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#undef pr_fmt
#define pr_fmt(fmt) "sgx: " fmt
+#define EREMOVE_ERROR_MESSAGE \
+ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+ "Refer to Documentation/x86/sgx.rst for more information."
+
#define SGX_MAX_EPC_SECTIONS 8
#define SGX_EEXTEND_BLOCK_SIZE 256
#define SGX_NR_TO_SCAN 16
@@ -30,28 +34,25 @@ struct sgx_epc_page {
};
/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+ struct list_head free_page_list;
+ spinlock_t lock;
+};
+
+/*
* The firmware can define multiple chunks of EPC to the different areas of the
* physical memory e.g. for memory areas of the each node. This structure is
* used to store EPC pages for one EPC section and virtual memory area where
* the pages have been mapped.
- *
- * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
*/
struct sgx_epc_section {
unsigned long phys_addr;
void *virt_addr;
struct sgx_epc_page *pages;
-
- spinlock_t lock;
- struct list_head page_list;
- unsigned long free_cnt;
-
- /*
- * Pages which need EREMOVE run on them before they can be
- * used. Only safe to be accessed in ksgxd and init code.
- * Not protected by locks.
- */
- struct list_head init_laundry_list;
+ struct sgx_numa_node *node;
};
extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
@@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..6ad165a5c0cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+ struct xarray page_array;
+ struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
+ * virtual EPC instances, and the lock to protect it.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct sgx_epc_page *epc_page;
+ unsigned long index, pfn;
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&vepc->lock));
+
+ /* Calculate index of EPC page in virtual EPC's page_array */
+ index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+ epc_page = xa_load(&vepc->page_array, index);
+ if (epc_page)
+ return 0;
+
+ epc_page = sgx_alloc_epc_page(vepc, false);
+ if (IS_ERR(epc_page))
+ return PTR_ERR(epc_page);
+
+ ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+ if (ret)
+ goto err_free;
+
+ pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+ ret = vmf_insert_pfn(vma, addr, pfn);
+ if (ret != VM_FAULT_NOPAGE) {
+ ret = -EFAULT;
+ goto err_delete;
+ }
+
+ return 0;
+
+err_delete:
+ xa_erase(&vepc->page_array, index);
+err_free:
+ sgx_free_epc_page(epc_page);
+ return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_vepc *vepc = vma->vm_private_data;
+ int ret;
+
+ mutex_lock(&vepc->lock);
+ ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+ mutex_unlock(&vepc->lock);
+
+ if (!ret)
+ return VM_FAULT_NOPAGE;
+
+ if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ return VM_FAULT_RETRY;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+ .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &sgx_vepc_vm_ops;
+ /* Don't copy VMA in fork() */
+ vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+ vma->vm_private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret;
+
+ /*
+ * Take a previously guest-owned EPC page and return it to the
+ * general EPC page pool.
+ *
+ * Guests can not be trusted to have left this page in a good
+ * state, so run EREMOVE on the page unconditionally. In the
+ * case that a guest properly EREMOVE'd this page, a superfluous
+ * EREMOVE is harmless.
+ */
+ ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ /*
+ * Only SGX_CHILD_PRESENT is expected, which is because of
+ * EREMOVE'ing an SECS still with child, in which case it can
+ * be handled by EREMOVE'ing the SECS again after all pages in
+ * virtual EPC have been EREMOVE'd. See comments in below in
+ * sgx_vepc_release().
+ *
+ * The user of virtual EPC (KVM) needs to guarantee there's no
+ * logical processor is still running in the enclave in guest,
+ * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+ * handled here.
+ */
+ WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+ ret, ret);
+ return ret;
+ }
+
+ sgx_free_epc_page(epc_page);
+
+ return 0;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc = file->private_data;
+ struct sgx_epc_page *epc_page, *tmp, *entry;
+ unsigned long index;
+
+ LIST_HEAD(secs_pages);
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ /*
+ * Remove all normal, child pages. sgx_vepc_free_page()
+ * will fail if EREMOVE fails, but this is OK and expected on
+ * SECS pages. Those can only be EREMOVE'd *after* all their
+ * child pages. Retries below will clean them up.
+ */
+ if (sgx_vepc_free_page(entry))
+ continue;
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * Retry EREMOVE'ing pages. This will clean up any SECS pages that
+ * only had children in this 'epc' area.
+ */
+ xa_for_each(&vepc->page_array, index, entry) {
+ epc_page = entry;
+ /*
+ * An EREMOVE failure here means that the SECS page still
+ * has children. But, since all children in this 'sgx_vepc'
+ * have been removed, the SECS page must have a child on
+ * another instance.
+ */
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * SECS pages are "pinned" by child pages, and "unpinned" once all
+ * children have been EREMOVE'd. A child page in this instance
+ * may have pinned an SECS page encountered in an earlier release(),
+ * creating a zombie. Since some children were EREMOVE'd above,
+ * try to EREMOVE all zombies in the hopes that one was unpinned.
+ */
+ mutex_lock(&zombie_secs_pages_lock);
+ list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+ /*
+ * Speculatively remove the page from the list of zombies,
+ * if the page is successfully EREMOVE'd it will be added to
+ * the list of free pages. If EREMOVE fails, throw the page
+ * on the local list, which will be spliced on at the end.
+ */
+ list_del(&epc_page->list);
+
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+ }
+
+ if (!list_empty(&secs_pages))
+ list_splice_tail(&secs_pages, &zombie_secs_pages);
+ mutex_unlock(&zombie_secs_pages_lock);
+
+ kfree(vepc);
+
+ return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc;
+
+ vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+ if (!vepc)
+ return -ENOMEM;
+ mutex_init(&vepc->lock);
+ xa_init(&vepc->page_array);
+
+ file->private_data = vepc;
+
+ return 0;
+}
+
+static const struct file_operations sgx_vepc_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_vepc_open,
+ .release = sgx_vepc_release,
+ .mmap = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_vepc",
+ .nodename = "sgx_vepc",
+ .fops = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+ /* SGX virtualization requires KVM to work */
+ if (!cpu_feature_enabled(X86_FEATURE_VMX))
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&zombie_secs_pages);
+ mutex_init(&zombie_secs_pages_lock);
+
+ return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo: Pointer to PAGEINFO structure
+ * @secs: Userspace pointer to SECS page
+ * @trapnr: trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * - 0: ECREATE was successful.
+ * - <0: on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr)
+{
+ int ret;
+
+ /*
+ * @secs is an untrusted, userspace-provided address. It comes from
+ * KVM and is assumed to be a valid pointer which points somewhere in
+ * userspace. This can fault and call SGX or other fault handlers when
+ * userspace mapping @secs doesn't exist.
+ *
+ * Add a WARN() to make sure @secs is already valid userspace pointer
+ * from caller (KVM), who should already have handled invalid pointer
+ * case (for instance, made by malicious guest). All other checks,
+ * such as alignment of @secs, are deferred to ENCLS itself.
+ */
+ if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __ecreate(pageinfo, (void *)secs);
+ __uaccess_end();
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ /* ECREATE doesn't return an error code, it faults or succeeds. */
+ WARN_ON_ONCE(ret);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs)
+{
+ int ret;
+
+ /*
+ * Make sure all userspace pointers from caller (KVM) are valid.
+ * All other checks deferred to ENCLS itself. Also see comment
+ * for @secs in sgx_virt_ecreate().
+ */
+#define SGX_EINITTOKEN_SIZE 304
+ if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+ !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+ !access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+ __uaccess_end();
+
+ return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct: Userspace pointer to SIGSTRUCT structure
+ * @token: Userspace pointer to EINITTOKEN structure
+ * @secs: Userspace pointer to SECS page
+ * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr: trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * - 0: EINIT was successful.
+ * - <0: on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ } else {
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(lepubkeyhash);
+
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ preempt_enable();
+ }
+
+ /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+ if (ret == -EINVAL)
+ return ret;
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 78bb0fae3982..bd01a6131edf 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -451,6 +451,10 @@ static void __init sev_map_percpu_data(void)
}
}
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
static bool pv_tlb_flush_supported(void)
{
return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
@@ -458,10 +462,6 @@ static bool pv_tlb_flush_supported(void)
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
}
-static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
-
-#ifdef CONFIG_SMP
-
static bool pv_ipi_supported(void)
{
return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
@@ -574,6 +574,49 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
}
}
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ u8 state;
+ int cpu;
+ struct kvm_steal_time *src;
+ struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+
+ cpumask_copy(flushmask, cpumask);
+ /*
+ * We have to call flush only on online vCPUs. And
+ * queue flush_on_enter for pre-empted vCPUs
+ */
+ for_each_cpu(cpu, flushmask) {
+ src = &per_cpu(steal_time, cpu);
+ state = READ_ONCE(src->preempted);
+ if ((state & KVM_VCPU_PREEMPTED)) {
+ if (try_cmpxchg(&src->preempted, &state,
+ state | KVM_VCPU_FLUSH_TLB))
+ __cpumask_clear_cpu(cpu, flushmask);
+ }
+ }
+
+ native_flush_tlb_others(flushmask, info);
+}
+
+static __init int kvm_alloc_cpumask(void)
+{
+ int cpu;
+
+ if (!kvm_para_available() || nopv)
+ return 0;
+
+ if (pv_tlb_flush_supported() || pv_ipi_supported())
+ for_each_possible_cpu(cpu) {
+ zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
+ }
+
+ return 0;
+}
+arch_initcall(kvm_alloc_cpumask);
+
static void __init kvm_smp_prepare_boot_cpu(void)
{
/*
@@ -611,33 +654,8 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
local_irq_enable();
return 0;
}
-#endif
-
-static void kvm_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
-{
- u8 state;
- int cpu;
- struct kvm_steal_time *src;
- struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
-
- cpumask_copy(flushmask, cpumask);
- /*
- * We have to call flush only on online vCPUs. And
- * queue flush_on_enter for pre-empted vCPUs
- */
- for_each_cpu(cpu, flushmask) {
- src = &per_cpu(steal_time, cpu);
- state = READ_ONCE(src->preempted);
- if ((state & KVM_VCPU_PREEMPTED)) {
- if (try_cmpxchg(&src->preempted, &state,
- state | KVM_VCPU_FLUSH_TLB))
- __cpumask_clear_cpu(cpu, flushmask);
- }
- }
- native_flush_tlb_others(flushmask, info);
-}
+#endif
static void __init kvm_guest_init(void)
{
@@ -653,12 +671,6 @@ static void __init kvm_guest_init(void)
pv_ops.time.steal_clock = kvm_steal_clock;
}
- if (pv_tlb_flush_supported()) {
- pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
- pr_info("KVM setup pv remote TLB flush\n");
- }
-
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
@@ -668,6 +680,12 @@ static void __init kvm_guest_init(void)
}
#ifdef CONFIG_SMP
+ if (pv_tlb_flush_supported()) {
+ pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
+ pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+ pr_info("KVM setup pv remote TLB flush\n");
+ }
+
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
if (pv_sched_yield_supported()) {
smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
@@ -734,7 +752,7 @@ static uint32_t __init kvm_detect(void)
static void __init kvm_apic_init(void)
{
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
if (pv_ipi_supported())
kvm_setup_pv_ipi();
#endif
@@ -794,32 +812,6 @@ static __init int activate_jump_labels(void)
}
arch_initcall(activate_jump_labels);
-static __init int kvm_alloc_cpumask(void)
-{
- int cpu;
- bool alloc = false;
-
- if (!kvm_para_available() || nopv)
- return 0;
-
- if (pv_tlb_flush_supported())
- alloc = true;
-
-#if defined(CONFIG_SMP)
- if (pv_ipi_supported())
- alloc = true;
-#endif
-
- if (alloc)
- for_each_possible_cpu(cpu) {
- zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
- GFP_KERNEL, cpu_to_node(cpu));
- }
-
- return 0;
-}
-arch_initcall(kvm_alloc_cpumask);
-
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a788d5120d4d..f6b93a35ce14 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,6 +84,18 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config X86_SGX_KVM
+ bool "Software Guard eXtensions (SGX) Virtualization"
+ depends on X86_SGX && KVM_INTEL
+ help
+
+ Enables KVM guests to create SGX enclaves.
+
+ This includes support to expose "raw" unreclaimable enclave memory to
+ guests via a device node, e.g. /dev/sgx_vepc.
+
+ If unsure, say N.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index eafc4d601f25..c589db5d91b3 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -23,6 +23,8 @@ kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
+
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6bd2f8b830e4..2ae061586677 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <asm/user.h>
#include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
@@ -28,7 +29,7 @@
* Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
* aligned to sizeof(unsigned long) because it's not accessed via bitops.
*/
-u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);
static u32 xstate_required_size(u64 xstate_bv, bool compacted)
@@ -53,6 +54,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
}
#define F feature_bit
+#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
@@ -170,6 +172,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.guest_supported_xcr0 =
(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+ /*
+ * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+ * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+ * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
+ * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+ * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
+ * '1' even on CPUs that don't support XSAVE.
+ */
+ best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
+ if (best) {
+ best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
+ best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
+ best->ecx |= XFEATURE_MASK_FPSSE;
+ }
+
kvm_update_pv_runtime(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -347,13 +364,13 @@ out:
return r;
}
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
{
const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
struct kvm_cpuid_entry2 entry;
reverse_cpuid_check(leaf);
- kvm_cpu_caps[leaf] &= mask;
cpuid_count(cpuid.function, cpuid.index,
&entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
@@ -361,6 +378,26 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
}
+static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+ BUILD_BUG_ON(leaf < NCAPINTS);
+
+ kvm_cpu_caps[leaf] = mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+ BUILD_BUG_ON(leaf >= NCAPINTS);
+
+ kvm_cpu_caps[leaf] &= mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
void kvm_set_cpu_caps(void)
{
unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
@@ -371,12 +408,13 @@ void kvm_set_cpu_caps(void)
unsigned int f_gbpages = 0;
unsigned int f_lm = 0;
#endif
+ memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
- BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+ BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
sizeof(boot_cpu_data.x86_capability));
memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
- sizeof(kvm_cpu_caps));
+ sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
kvm_cpu_cap_mask(CPUID_1_ECX,
/*
@@ -407,7 +445,7 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_mask(CPUID_7_0_EBX,
- F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+ F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
@@ -418,7 +456,8 @@ void kvm_set_cpu_caps(void)
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+ F(SGX_LC)
);
/* Set LA57 based on hardware capability. */
if (cpuid_ecx(7) & F(LA57))
@@ -457,6 +496,10 @@ void kvm_set_cpu_caps(void)
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
);
+ kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+ SF(SGX1) | SF(SGX2)
+ );
+
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -778,6 +821,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = 0;
}
break;
+ case 0x12:
+ /* Intel SGX */
+ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
+ * and max enclave sizes. The SGX sub-features and MISCSELECT
+ * are restricted by kernel and KVM capabilities (like most
+ * feature flags), while enclave size is unrestricted.
+ */
+ cpuid_entry_override(entry, CPUID_12_EAX);
+ entry->ebx &= SGX_MISC_EXINFO;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ /*
+ * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la
+ * feature flags. Advertise all supported flags, including
+ * privileged attributes that require explicit opt-in from
+ * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
+ * expected to derive it from supported XCR0.
+ */
+ entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
+ SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
+ SGX_ATTR_KSS;
+ entry->ebx &= 0;
+ break;
/* Intel PT */
case 0x14:
if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 2a0c5064497f..888e88b42e8d 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -7,7 +7,25 @@
#include <asm/processor.h>
#include <uapi/asm/kvm_para.h>
-extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+/*
+ * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
+ * be directly used by KVM. Note, these word values conflict with the kernel's
+ * "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+ CPUID_12_EAX = NCAPINTS,
+ NR_KVM_CPU_CAPS,
+
+ NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
@@ -80,6 +98,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
[CPUID_7_EDX] = { 7, 0, CPUID_EDX},
[CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
+ [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
};
/*
@@ -101,6 +120,25 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
}
/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+ if (x86_feature == X86_FEATURE_SGX1)
+ return KVM_X86_FEATURE_SGX1;
+ else if (x86_feature == X86_FEATURE_SGX2)
+ return KVM_X86_FEATURE_SGX2;
+
+ return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+ return __feature_translate(x86_feature) / 32;
+}
+
+/*
* Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
* the hardware defined bit number (stored in bits 4:0) and a software defined
* "word" (stored in bits 31:5). The word is used to index into arrays of
@@ -108,6 +146,8 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
*/
static __always_inline u32 __feature_bit(int x86_feature)
{
+ x86_feature = __feature_translate(x86_feature);
+
reverse_cpuid_check(x86_feature / 32);
return 1 << (x86_feature & 31);
}
@@ -116,7 +156,7 @@ static __always_inline u32 __feature_bit(int x86_feature)
static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
return reverse_cpuid[x86_leaf];
@@ -248,6 +288,14 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
}
+static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0, 0);
+ return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
+}
+
static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -308,7 +356,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
@@ -316,7 +364,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
@@ -324,7 +372,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cc369b9ad8f1..0050f39e90d9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2869,7 +2869,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
return;
if (is_guest_mode(vcpu)) {
- r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ r = kvm_check_nested_events(vcpu);
if (r < 0)
return;
/*
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c68bfc3e2402..88d0ed5225a4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -59,7 +59,8 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
@@ -73,6 +74,10 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
@@ -102,8 +107,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(root_hpa))
return;
- static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
- vcpu->arch.mmu->shadow_root_level);
+ static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->shadow_root_level);
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -124,7 +129,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* write-protects guest page to sync the guest modification, b) another one is
* used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
* between these two sorts are:
- * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 1) the first case clears MMU-writable bit.
* 2) the first case requires flushing tlb immediately avoiding corrupting
* shadow page table between all vcpus so it should be in the protection of
* mmu-lock. And the another case does not need to flush tlb until returning
@@ -135,17 +140,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* So, there is the problem: the first case can meet the corrupted tlb caused
* by another case which write-protects pages but without flush tlb
* immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
- * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ * it flush tlb if we try to write-protect a spte whose MMU-writable bit
+ * is set, it works since another case never touches MMU-writable bit.
*
* Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * changed) we need to check whether the spte with MMU-writable becomes
* readonly, if that happens, we need to flush tlb. Fortunately,
* mmu_spte_update() has already handled it perfectly.
*
- * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * The rules to use MMU-writable and PT_WRITABLE_MASK:
* - if we want to see if it has writable tlb entry or if the spte can be
- * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ * writable on the mmu mapping, check MMU-writable, this is the most
* case, otherwise
* - if we fix page fault on the spte or do write-protection by dirty logging,
* check PT_WRITABLE_MASK.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 486aa94ecf1d..930ac8a7e7c9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -48,6 +48,7 @@
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
+#include <asm/set_memory.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"
@@ -215,10 +216,10 @@ bool is_nx_huge_page_enabled(void)
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned int access)
{
- u64 mask = make_mmio_spte(vcpu, gfn, access);
+ u64 spte = make_mmio_spte(vcpu, gfn, access);
- trace_mark_mmio_spte(sptep, gfn, mask);
- mmu_spte_set(sptep, mask);
+ trace_mark_mmio_spte(sptep, gfn, spte);
+ mmu_spte_set(sptep, spte);
}
static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -236,17 +237,6 @@ static unsigned get_mmio_spte_access(u64 spte)
return spte & shadow_mmio_access_mask;
}
-static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- kvm_pfn_t pfn, unsigned int access)
-{
- if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(vcpu, sptep, gfn, access);
- return true;
- }
-
- return false;
-}
-
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
u64 kvm_gen, spte_gen, gen;
@@ -725,8 +715,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
* handling slots that are not large page aligned.
*/
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+ const struct kvm_memory_slot *slot, int level)
{
unsigned long idx;
@@ -1118,7 +1107,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
rmap_printk("spte %p %llx\n", sptep, *sptep);
if (pt_protect)
- spte &= ~SPTE_MMU_WRITEABLE;
+ spte &= ~shadow_mmu_writable_mask;
spte = spte & ~PT_WRITABLE_MASK;
return mmu_spte_update(sptep, spte);
@@ -1308,26 +1297,25 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
return flush;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
return kvm_zap_rmapp(kvm, rmap_head, slot);
}
-static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t pte)
{
u64 *sptep;
struct rmap_iterator iter;
int need_flush = 0;
u64 new_spte;
- pte_t *ptep = (pte_t *)data;
kvm_pfn_t new_pfn;
- WARN_ON(pte_huge(*ptep));
- new_pfn = pte_pfn(*ptep);
+ WARN_ON(pte_huge(pte));
+ new_pfn = pte_pfn(pte);
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
@@ -1336,7 +1324,7 @@ restart:
need_flush = 1;
- if (pte_write(*ptep)) {
+ if (pte_write(pte)) {
pte_list_remove(rmap_head, sptep);
goto restart;
} else {
@@ -1424,93 +1412,52 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-static __always_inline int
-kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn,
- int level,
- unsigned long data))
+typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t pte);
+
+static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ rmap_handler_t handler)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
struct slot_rmap_walk_iterator iterator;
- int ret = 0;
- int i;
-
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
+ bool ret = false;
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- gfn_start, gfn_end - 1,
- &iterator)
- ret |= handler(kvm, iterator.rmap, memslot,
- iterator.gfn, iterator.level, data);
- }
- }
+ for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ range->start, range->end - 1, &iterator)
+ ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
+ iterator.level, range->pte);
return ret;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn, int level,
- unsigned long data))
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
-{
- int r;
+ bool flush;
- r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+ flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
- r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
+ flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
- return r;
+ return flush;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int r;
+ bool flush;
- r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+ flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
if (is_tdp_mmu_enabled(kvm))
- r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
+ flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
- return r;
+ return flush;
}
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1519,13 +1466,12 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
for_each_rmap_spte(rmap_head, &iter, sptep)
young |= mmu_spte_age(sptep);
- trace_kvm_age_page(gfn, level, slot, young);
return young;
}
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, unsigned long data)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1547,29 +1493,31 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
- kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
KVM_PAGES_PER_HPAGE(sp->role.level));
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int young = false;
+ bool young;
+
+ young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
- young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
- young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
+ young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
return young;
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int young = false;
+ bool young;
+
+ young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
- young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
- young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
+ young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
return young;
}
@@ -2421,6 +2369,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
+ /*
+ * Note, this check is intentionally soft, it only guarantees that one
+ * page is available, while the caller may end up allocating as many as
+ * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
+ * exceeding the (arbitrary by default) limit will not harm the host,
+ * being too agressive may unnecessarily kill the guest, and getting an
+ * exact count is far more trouble than it's worth, especially in the
+ * page fault paths.
+ */
if (!kvm_mmu_available_pages(vcpu->kvm))
return -ENOSPC;
return 0;
@@ -2561,9 +2518,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
struct kvm_mmu_page *sp;
int ret;
- if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
- return 0;
-
sp = sptep_to_sp(sptep);
ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
@@ -2593,6 +2547,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
+ if (unlikely(is_noslot_pfn(pfn))) {
+ mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+ return RET_PF_EMULATE;
+ }
+
if (is_shadow_present_pte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
@@ -2626,9 +2585,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
KVM_PAGES_PER_HPAGE(level));
- if (unlikely(is_mmio_spte(*sptep)))
- ret = RET_PF_EMULATE;
-
/*
* The fault is fully spurious if and only if the new SPTE and old SPTE
* are identical, and emulation is not required.
@@ -2745,7 +2701,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
}
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
unsigned long hva;
pte_t *pte;
@@ -2771,8 +2727,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
return level;
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level)
{
struct kvm_lpage_info *linfo;
@@ -2946,9 +2903,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
return true;
}
- if (unlikely(is_noslot_pfn(pfn)))
+ if (unlikely(is_noslot_pfn(pfn))) {
vcpu_cache_mmio_info(vcpu, gva, gfn,
access & shadow_mmio_access_mask);
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop.
+ */
+ if (unlikely(!shadow_mmio_value)) {
+ *ret_val = RET_PF_EMULATE;
+ return true;
+ }
+ }
return false;
}
@@ -3061,6 +3028,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (!is_shadow_present_pte(spte))
break;
+ if (!is_shadow_present_pte(spte))
+ break;
+
sp = sptep_to_sp(iterator.sptep);
if (!is_last_spte(spte, sp->role.level))
break;
@@ -3150,12 +3120,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
- if (kvm_mmu_put_root(kvm, sp)) {
- if (is_tdp_mmu_page(sp))
- kvm_tdp_mmu_free_root(kvm, sp);
- else if (sp->role.invalid)
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
- }
+ if (is_tdp_mmu_page(sp))
+ kvm_tdp_mmu_put_root(kvm, sp, false);
+ else if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
*root_hpa = INVALID_PAGE;
}
@@ -3193,14 +3161,17 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
(mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
- } else {
- for (i = 0; i < 4; ++i)
- if (mmu->pae_root[i] != 0)
- mmu_free_root_page(kvm,
- &mmu->pae_root[i],
- &invalid_list);
- mmu->root_hpa = INVALID_PAGE;
+ } else if (mmu->pae_root) {
+ for (i = 0; i < 4; ++i) {
+ if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+ continue;
+
+ mmu_free_root_page(kvm, &mmu->pae_root[i],
+ &invalid_list);
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ }
}
+ mmu->root_hpa = INVALID_PAGE;
mmu->root_pgd = 0;
}
@@ -3226,155 +3197,208 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
{
struct kvm_mmu_page *sp;
- write_lock(&vcpu->kvm->mmu_lock);
-
- if (make_mmu_pages_available(vcpu)) {
- write_unlock(&vcpu->kvm->mmu_lock);
- return INVALID_PAGE;
- }
sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
++sp->root_count;
- write_unlock(&vcpu->kvm->mmu_lock);
return __pa(sp->spt);
}
static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
- u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u8 shadow_root_level = mmu->shadow_root_level;
hpa_t root;
unsigned i;
+ int r;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ mmu->root_hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
- root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
- true);
-
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+ mmu->root_hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
for (i = 0; i < 4; ++i) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, true);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
+ mmu->pae_root[i] = root | PT_PRESENT_MASK |
+ shadow_me_mask;
}
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
- } else
- BUG();
+ mmu->root_hpa = __pa(mmu->pae_root);
+ } else {
+ WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+ r = -EIO;
+ goto out_unlock;
+ }
/* root_pgd is ignored for direct MMUs. */
- vcpu->arch.mmu->root_pgd = 0;
-
- return 0;
+ mmu->root_pgd = 0;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
}
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
- u64 pdptr, pm_mask;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 pdptrs[4], pm_mask;
gfn_t root_gfn, root_pgd;
hpa_t root;
- int i;
+ unsigned i;
+ int r;
- root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+ root_pgd = mmu->get_guest_pgd(vcpu);
root_gfn = root_pgd >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
return 1;
/*
+ * On SVM, reading PDPTRs might access guest memory, which might fault
+ * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
+ */
+ if (mmu->root_level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ pdptrs[i] = mmu->get_pdptr(vcpu, i);
+ if (!(pdptrs[i] & PT_PRESENT_MASK))
+ continue;
+
+ if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
+ return 1;
+ }
+ }
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
+ /*
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
+ if (mmu->root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
- vcpu->arch.mmu->shadow_root_level, false);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ mmu->shadow_root_level, false);
+ mmu->root_hpa = root;
goto set_root_pgd;
}
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
/*
* We shadow a 32 bit page table. This may be a legacy 2-level
* or a PAE 3-level page table. In either case we need to be aware that
* the shadow page table may be a PAE or a long mode page table.
*/
- pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ pm_mask = PT_PRESENT_MASK | shadow_me_mask;
+ if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+ if (WARN_ON_ONCE(!mmu->lm_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+ }
+
for (i = 0; i < 4; ++i) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
- if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
- pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
- if (!(pdptr & PT_PRESENT_MASK)) {
- vcpu->arch.mmu->pae_root[i] = 0;
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+ if (mmu->root_level == PT32E_ROOT_LEVEL) {
+ if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
continue;
}
- root_gfn = pdptr >> PAGE_SHIFT;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
+ root_gfn = pdptrs[i] >> PAGE_SHIFT;
}
root = mmu_alloc_root(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, false);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->pae_root[i] = root | pm_mask;
+ mmu->pae_root[i] = root | pm_mask;
}
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
+
+ if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ mmu->root_hpa = __pa(mmu->lm_root);
+ else
+ mmu->root_hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+ mmu->root_pgd = root_pgd;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return 0;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 *lm_root, *pae_root;
/*
- * If we shadow a 32 bit page table with a long mode page
- * table we enter this path.
+ * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+ * tables are allocated and initialized at root creation as there is no
+ * equivalent level in the guest's NPT to shadow. Allocate the tables
+ * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
*/
- if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
- if (vcpu->arch.mmu->lm_root == NULL) {
- /*
- * The additional page necessary for this is only
- * allocated on demand.
- */
+ if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
+ mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+ return 0;
- u64 *lm_root;
+ /*
+ * This mess only works with 4-level paging and needs to be updated to
+ * work with 5-level paging.
+ */
+ if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
+ return -EIO;
- lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (lm_root == NULL)
- return 1;
+ if (mmu->pae_root && mmu->lm_root)
+ return 0;
- lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
+ /*
+ * The special roots should always be allocated in concert. Yell and
+ * bail if KVM ends up in a state where only one of the roots is valid.
+ */
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+ return -EIO;
- vcpu->arch.mmu->lm_root = lm_root;
- }
+ /*
+ * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+ * doesn't need to be decrypted.
+ */
+ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pae_root)
+ return -ENOMEM;
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
+ lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!lm_root) {
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
}
-set_root_pgd:
- vcpu->arch.mmu->root_pgd = root_pgd;
+ mmu->pae_root = pae_root;
+ mmu->lm_root = lm_root;
return 0;
}
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.mmu->direct_map)
- return mmu_alloc_direct_roots(vcpu);
- else
- return mmu_alloc_shadow_roots(vcpu);
-}
-
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
@@ -3422,7 +3446,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
- if (root && VALID_PAGE(root)) {
+ if (IS_VALID_PAE_ROOT(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = to_shadow_page(root);
mmu_sync_children(vcpu, sp);
@@ -3554,11 +3578,12 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
__is_rsvd_bits_set(rsvd_check, sptes[level], level);
if (reserved) {
- pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+ pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
__func__, addr);
for (level = root; level >= leaf; level--)
- pr_err("------ spte 0x%llx level %d.\n",
- sptes[level], level);
+ pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+ sptes[level], level,
+ rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]);
}
return reserved;
@@ -3653,6 +3678,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ */
+ if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
+ return true;
+
/* Don't expose private memslots to L2. */
if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
*pfn = KVM_PFN_NOSLOT;
@@ -4615,12 +4648,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
- context->shadow_root_level = new_role.base.level;
-
__kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
- if (new_role.as_u64 != context->mmu_role.as_u64)
+ if (new_role.as_u64 != context->mmu_role.as_u64) {
shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+
+ /*
+ * Override the level set by the common init helper, nested TDP
+ * always uses the host's TDP configuration.
+ */
+ context->shadow_root_level = new_role.base.level;
+ }
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
@@ -4802,16 +4840,23 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
if (r)
goto out;
- r = mmu_alloc_roots(vcpu);
- kvm_mmu_sync_roots(vcpu);
+ r = mmu_alloc_special_roots(vcpu);
+ if (r)
+ goto out;
+ if (vcpu->arch.mmu->direct_map)
+ r = mmu_alloc_direct_roots(vcpu);
+ else
+ r = mmu_alloc_shadow_roots(vcpu);
if (r)
goto out;
+
+ kvm_mmu_sync_roots(vcpu);
+
kvm_mmu_load_pgd(vcpu);
static_call(kvm_x86_tlb_flush_current)(vcpu);
out:
return r;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
@@ -4820,7 +4865,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
}
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
static bool need_remote_flush(u64 old, u64 new)
{
@@ -5169,10 +5213,10 @@ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_
static __always_inline bool
slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+ gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
+ bool flush)
{
struct slot_rmap_walk_iterator iterator;
- bool flush = false;
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
@@ -5180,7 +5224,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
flush |= fn(kvm, iterator.rmap, memslot);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- if (flush && lock_flush_tlb) {
+ if (flush && flush_on_yield) {
kvm_flush_remote_tlbs_with_address(kvm,
start_gfn,
iterator.gfn - start_gfn + 1);
@@ -5190,36 +5234,32 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
}
- if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
- end_gfn - start_gfn + 1);
- flush = false;
- }
-
return flush;
}
static __always_inline bool
slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- bool lock_flush_tlb)
+ bool flush_on_yield)
{
return slot_handle_level_range(kvm, memslot, fn, start_level,
end_level, memslot->base_gfn,
memslot->base_gfn + memslot->npages - 1,
- lock_flush_tlb);
+ flush_on_yield, false);
}
static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ slot_level_handler fn, bool flush_on_yield)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
- PG_LEVEL_4K, lock_flush_tlb);
+ PG_LEVEL_4K, flush_on_yield);
}
static void free_mmu_pages(struct kvm_mmu *mmu)
{
+ if (!tdp_enabled && mmu->pae_root)
+ set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
free_page((unsigned long)mmu->lm_root);
}
@@ -5240,9 +5280,11 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
* while the PDP table is a per-vCPU construct that's allocated at MMU
* creation. When emulating 32-bit mode, cr3 is only 32 bits even on
* x86_64. Therefore we need to allocate the PDP table in the first
- * 4GB of memory, which happens to fit the DMA32 zone. Except for
- * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
- * skip allocating the PDP table.
+ * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
+ * generally doesn't use PAE paging and can skip allocating the PDP
+ * table. The main exception, handled here, is SVM's 32-bit NPT. The
+ * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+ * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
*/
if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
@@ -5252,8 +5294,22 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
return -ENOMEM;
mmu->pae_root = page_address(page);
+
+ /*
+ * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+ * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
+ * that KVM's writes and the CPU's reads get along. Note, this is
+ * only necessary when using shadow paging, as 64-bit NPT can get at
+ * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+ * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+ */
+ if (!tdp_enabled)
+ set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+ else
+ WARN_ON_ONCE(shadow_me_mask);
+
for (i = 0; i < 4; ++i)
- mmu->pae_root[i] = INVALID_PAGE;
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
return 0;
}
@@ -5365,6 +5421,15 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
*/
kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
+ /* In order to ensure all threads see this change when
+ * handling the MMU reload signal, this must happen in the
+ * same critical section as kvm_reload_remote_mmus, and
+ * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
+ * could drop the MMU lock and yield.
+ */
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_invalidate_all_roots(kvm);
+
/*
* Notify all vcpus to reload its shadow page table and flush TLB.
* Then all vcpus will switch to new shadow page table with the new
@@ -5377,10 +5442,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_zap_obsolete_pages(kvm);
- if (is_tdp_mmu_enabled(kvm))
- kvm_tdp_mmu_zap_all(kvm);
-
write_unlock(&kvm->mmu_lock);
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ read_unlock(&kvm->mmu_lock);
+ }
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5420,7 +5488,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i;
- bool flush;
+ bool flush = false;
write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -5433,20 +5501,31 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (start >= end)
continue;
- slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
- PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true);
+ flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PG_LEVEL_4K,
+ KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
}
}
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+ write_unlock(&kvm->mmu_lock);
+
if (is_tdp_mmu_enabled(kvm)) {
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+ flush = false;
+
+ read_lock(&kvm->mmu_lock);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+ gfn_end, flush, true);
if (flush)
- kvm_flush_remote_tlbs(kvm);
- }
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+ gfn_end);
- write_unlock(&kvm->mmu_lock);
+ read_unlock(&kvm->mmu_lock);
+ }
}
static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5465,10 +5544,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
write_lock(&kvm->mmu_lock);
flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
- if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+ read_unlock(&kvm->mmu_lock);
+ }
+
/*
* We can flush all the TLBs out of the mmu lock without TLB
* corruption since we just change the spte from writable to
@@ -5476,9 +5559,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
* spte from present to present (changing the spte from present
* to nonpresent will flush all the TLBs immediately), in other
* words, the only case we care is mmu_spte_update() where we
- * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
- * instead of PT_WRITABLE_MASK, that means it does not depend
- * on PT_WRITABLE_MASK anymore.
+ * have checked Host-writable | MMU-writable instead of
+ * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
+ * anymore.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5529,21 +5612,32 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
+ bool flush;
write_lock(&kvm->mmu_lock);
- slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+ flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
- if (is_tdp_mmu_enabled(kvm))
- kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+ if (flush)
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ flush = false;
+
+ read_lock(&kvm->mmu_lock);
+ flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
+ if (flush)
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ read_unlock(&kvm->mmu_lock);
+ }
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
/*
* All current use cases for flushing the TLBs for a specific memslot
- * are related to dirty logging, and do the TLB flush out of mmu_lock.
+ * related to dirty logging, and many do the TLB flush out of mmu_lock.
* The interaction between the various operations on memslot must be
* serialized by slots_locks to ensure the TLB flush from one operation
* is observed by any other operation on the same memslot.
@@ -5560,10 +5654,14 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
write_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
- if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+ read_unlock(&kvm->mmu_lock);
+ }
+
/*
* It's also safe to flush TLBs out of mmu lock here as currently this
* function is only used for dirty logging, in which case flushing TLB
@@ -5701,25 +5799,6 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
-static void kvm_set_mmio_spte_mask(void)
-{
- u64 mask;
-
- /*
- * Set a reserved PA bit in MMIO SPTEs to generate page faults with
- * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
- * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
- * 52-bit physical addresses then there are no reserved PA bits in the
- * PTEs and so the reserved PA approach must be disabled.
- */
- if (shadow_phys_bits < 52)
- mask = BIT_ULL(51) | PT_PRESENT_MASK;
- else
- mask = 0;
-
- kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
-}
-
static bool get_nx_auto_mode(void)
{
/* Return true when CPU has the bug, and mitigations are ON */
@@ -5785,8 +5864,6 @@ int kvm_mmu_module_init(void)
kvm_mmu_reset_all_pte_masks();
- kvm_set_mmio_spte_mask();
-
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, SLAB_ACCOUNT, NULL);
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index ced15fd58fde..cedc17b2f60e 100644
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c
@@ -70,7 +70,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
- if (root && VALID_PAGE(root)) {
+ if (IS_VALID_PAE_ROOT(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = to_shadow_page(root);
__mmu_spte_walk(vcpu, sp, fn, 2);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 1f6f98c76bdf..f2546d6d390c 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -20,6 +20,16 @@ extern bool dbg;
#define MMU_WARN_ON(x) do { } while (0)
#endif
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid. And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT 0
+#define IS_VALID_PAE_ROOT(x) (!!(x))
+
struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
@@ -40,7 +50,11 @@ struct kvm_mmu_page {
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
- int root_count; /* Currently serving as active root */
+ /* Currently serving as active root */
+ union {
+ int root_count;
+ refcount_t tdp_mmu_root_count;
+ };
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
DECLARE_BITMAP(unsync_child_bitmap, 512);
@@ -78,9 +92,14 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
}
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
+ return role.smm ? 1 : 0;
+}
+
static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
- return sp->role.smm ? 1 : 0;
+ return kvm_mmu_role_as_id(sp->role);
}
static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
@@ -108,22 +127,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
-static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- BUG_ON(!sp->root_count);
- lockdep_assert_held(&kvm->mmu_lock);
-
- ++sp->root_count;
-}
-
-static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- lockdep_assert_held(&kvm->mmu_lock);
- --sp->root_count;
-
- return !sp->root_count;
-}
-
/*
* Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
*
@@ -146,8 +149,9 @@ enum {
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
#define SET_SPTE_SPURIOUS BIT(2)
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level);
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
bool huge_page_disallowed, int *req_level);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 55d7b473ac44..70b7e44e3035 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -503,6 +503,7 @@ error:
#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+ walker->fault.async_page_fault = false;
trace_kvm_mmu_walker_error(walker->fault.error_code);
return 0;
@@ -1084,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
nr_present++;
- host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+ host_writable = sp->spt[i] & shadow_host_writable_mask;
set_spte_ret |= set_spte(vcpu, &sp->spt[i],
pte_access, PG_LEVEL_4K,
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index ef55f0bc4ccf..66d43cec0c31 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -16,13 +16,20 @@
#include "spte.h"
#include <asm/e820/api.h>
+#include <asm/vmx.h>
+static bool __read_mostly enable_mmio_caching = true;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;
u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
u64 __read_mostly shadow_user_mask;
u64 __read_mostly shadow_accessed_mask;
u64 __read_mostly shadow_dirty_mask;
u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
u64 __read_mostly shadow_mmio_access_mask;
u64 __read_mostly shadow_present_mask;
u64 __read_mostly shadow_me_mask;
@@ -38,7 +45,6 @@ static u64 generation_mmio_spte_mask(u64 gen)
u64 mask;
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
- BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -48,16 +54,18 @@ static u64 generation_mmio_spte_mask(u64 gen)
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
- u64 mask = generation_mmio_spte_mask(gen);
+ u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
+ WARN_ON_ONCE(!shadow_mmio_value);
+
access &= shadow_mmio_access_mask;
- mask |= shadow_mmio_value | access;
- mask |= gpa | shadow_nonpresent_or_rsvd_mask;
- mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ spte |= shadow_mmio_value | access;
+ spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+ spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
- return mask;
+ return spte;
}
static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -86,13 +94,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
bool can_unsync, bool host_writable, bool ad_disabled,
u64 *new_spte)
{
- u64 spte = 0;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
int ret = 0;
if (ad_disabled)
- spte |= SPTE_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED_MASK;
else if (kvm_vcpu_ad_need_write_protect(vcpu))
- spte |= SPTE_AD_WRPROT_ONLY_MASK;
+ spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+
+ /*
+ * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set
+ * if PAE paging may be employed (shadow paging or any 32-bit KVM).
+ */
+ WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
+ (spte & SPTE_TDP_AD_MASK));
/*
* For the EPT case, shadow_present_mask is 0 if hardware
@@ -124,7 +139,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
kvm_is_mmio_pfn(pfn));
if (host_writable)
- spte |= SPTE_HOST_WRITEABLE;
+ spte |= shadow_host_writable_mask;
else
pte_access &= ~ACC_WRITE_MASK;
@@ -134,7 +149,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
spte |= (u64)pfn << PAGE_SHIFT;
if (pte_access & ACC_WRITE_MASK) {
- spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+ spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
/*
* Optimization: for pte sync, if spte was writable the hash
@@ -150,7 +165,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
__func__, gfn);
ret |= SET_SPTE_WRITE_PROTECTED_PT;
pte_access &= ~ACC_WRITE_MASK;
- spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
}
}
@@ -161,19 +176,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
spte = mark_spte_for_access_track(spte);
out:
+ WARN_ON(is_mmio_spte(spte));
*new_spte = spte;
return ret;
}
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
- u64 spte;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
- spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_me_mask;
+ spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_mask;
if (ad_disabled)
- spte |= SPTE_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED_MASK;
else
spte |= shadow_accessed_mask;
@@ -188,7 +204,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
new_spte |= (u64)new_pfn << PAGE_SHIFT;
new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~SPTE_HOST_WRITEABLE;
+ new_spte &= ~shadow_host_writable_mask;
new_spte = mark_spte_for_access_track(new_spte);
@@ -242,53 +258,68 @@ u64 mark_spte_for_access_track(u64 spte)
return spte;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
- WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
- shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
+
+ if (!enable_mmio_caching)
+ mmio_value = 0;
+
+ /*
+ * Disable MMIO caching if the MMIO value collides with the bits that
+ * are used to hold the relocated GFN when the L1TF mitigation is
+ * enabled. This should never fire as there is no known hardware that
+ * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+ * MMIO value are not susceptible to L1TF.
+ */
+ if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+ SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+ mmio_value = 0;
+
+ /*
+ * The masked MMIO value must obviously match itself and a removed SPTE
+ * must not get a false positive. Removed SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * not set any RWX bits.
+ */
+ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+ WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+ mmio_value = 0;
+
+ shadow_mmio_value = mmio_value;
+ shadow_mmio_mask = mmio_mask;
shadow_mmio_access_mask = access_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-/*
- * Sets the shadow PTE masks used by the MMU.
- *
- * Assumptions:
- * - Setting either @accessed_mask or @dirty_mask requires setting both
- * - At least one of @accessed_mask or @acc_track_mask must be set
- */
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask, u64 me_mask)
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
{
- BUG_ON(!dirty_mask != !accessed_mask);
- BUG_ON(!accessed_mask && !acc_track_mask);
- BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
-
- shadow_user_mask = user_mask;
- shadow_accessed_mask = accessed_mask;
- shadow_dirty_mask = dirty_mask;
- shadow_nx_mask = nx_mask;
- shadow_x_mask = x_mask;
- shadow_present_mask = p_mask;
- shadow_acc_track_mask = acc_track_mask;
- shadow_me_mask = me_mask;
+ shadow_user_mask = VMX_EPT_READABLE_MASK;
+ shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+ shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+ shadow_nx_mask = 0ull;
+ shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
+ shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_me_mask = 0ull;
+
+ shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
+
+ /*
+ * EPT Misconfigurations are generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ */
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+ VMX_EPT_RWX_MASK, 0);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
void kvm_mmu_reset_all_pte_masks(void)
{
u8 low_phys_bits;
-
- shadow_user_mask = 0;
- shadow_accessed_mask = 0;
- shadow_dirty_mask = 0;
- shadow_nx_mask = 0;
- shadow_x_mask = 0;
- shadow_present_mask = 0;
- shadow_acc_track_mask = 0;
+ u64 mask;
shadow_phys_bits = kvm_get_shadow_phys_bits();
@@ -315,4 +346,30 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_nonpresent_or_rsvd_lower_gfn_mask =
GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+ shadow_user_mask = PT_USER_MASK;
+ shadow_accessed_mask = PT_ACCESSED_MASK;
+ shadow_dirty_mask = PT_DIRTY_MASK;
+ shadow_nx_mask = PT64_NX_MASK;
+ shadow_x_mask = 0;
+ shadow_present_mask = PT_PRESENT_MASK;
+ shadow_acc_track_mask = 0;
+ shadow_me_mask = sme_me_mask;
+
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 6de3950fd704..bca0ba11cccf 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -5,18 +5,33 @@
#include "mmu_internal.h"
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+/*
+ * A MMU present SPTE is backed by actual memory and may or may not be present
+ * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+. MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
/*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
+ * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that
+ * is must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE. This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
*/
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
+#define SPTE_TDP_AD_SHIFT 52
+#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
@@ -51,16 +66,46 @@
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page. This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+ PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58)
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
/*
- * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
* the memslots generation and is derived as follows:
*
- * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
@@ -71,39 +116,44 @@
*/
#define MMIO_SPTE_GEN_LOW_START 3
-#define MMIO_SPTE_GEN_LOW_END 11
+#define MMIO_SPTE_GEN_LOW_END 10
-#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_START 52
#define MMIO_SPTE_GEN_HIGH_END 62
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+ (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
/* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
extern u64 __read_mostly shadow_me_mask;
/*
- * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
* shadow_acc_track_mask is the set of bits to be cleared in non-accessed
* pages.
*/
@@ -121,28 +171,21 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
/*
- * The mask/shift to use for saving the original R/X bits when marking the PTE
- * as not-present for access tracking purposes. We do not save the W bit as the
- * PTEs being access tracked also need to be dirty tracked, so the W bit will be
- * restored only when a write is attempted to the page.
- */
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
- PT64_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
-
-/*
* If a thread running without exclusive control of the MMU lock must perform a
* multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
*
- * This constant works because it is considered non-present on both AMD and
- * Intel CPUs and does not create a L1TF vulnerability because the pfn section
- * is zeroed out.
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * vulnerability. Use only low bits to avoid 64-bit immediates.
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE (1ull << 59)
+#define REMOVED_SPTE 0x5a0ULL
+
+/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
static inline bool is_removed_spte(u64 spte)
{
@@ -167,7 +210,13 @@ extern u8 __read_mostly shadow_phys_bits;
static inline bool is_mmio_spte(u64 spte)
{
- return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
+ return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ likely(shadow_mmio_value);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+ return !!(pte & SPTE_MMU_PRESENT_MASK);
}
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
@@ -177,25 +226,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
+ /*
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+ * and non-TDP SPTEs will never set these bits. Optimize for 64-bit
+ * TDP and do the A/D type check unconditionally.
+ */
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
@@ -204,11 +258,6 @@ static inline bool is_access_track_spte(u64 spte)
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
-static inline bool is_shadow_present_pte(u64 pte)
-{
- return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
-}
-
static inline bool is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
@@ -246,8 +295,8 @@ static inline bool is_dirty_spte(u64 spte)
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{
- return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
- (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+ return (spte & shadow_host_writable_mask) &&
+ (spte & shadow_mmu_writable_mask);
}
static inline u64 get_mmio_spte_generation(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 018d82e73e31..83cbdbe5de5a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+ bool shared)
+{
+ if (shared)
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
if (!kvm->arch.tdp_mmu_enabled)
@@ -41,32 +50,85 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
rcu_barrier();
}
-static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool flush,
+ bool shared);
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
- if (kvm_mmu_put_root(kvm, root))
- kvm_tdp_mmu_free_root(kvm, root);
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
}
-static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
- struct kvm_mmu_page *root)
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
- lockdep_assert_held_write(&kvm->mmu_lock);
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
- if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
- return false;
+ tdp_mmu_free_sp(sp);
+}
- kvm_mmu_get_root(kvm, root);
- return true;
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+ gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+ return;
+
+ WARN_ON(!root->tdp_mmu_page);
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_del_rcu(&root->link);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+ zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
+
+ call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
-static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
- struct kvm_mmu_page *root)
+/*
+ * Finds the next valid root after root (or the first valid root if root
+ * is NULL), takes a reference on it, and returns that next root. If root
+ * is not NULL, this thread should have already taken a reference on it, and
+ * that reference will be dropped. If no valid root is found, this
+ * function will return NULL.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root,
+ bool shared)
{
struct kvm_mmu_page *next_root;
- next_root = list_next_entry(root, link);
- tdp_mmu_put_root(kvm, root);
+ rcu_read_lock();
+
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link, typeof(*next_root), link);
+
+ rcu_read_unlock();
+
+ if (prev_root)
+ kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+
return next_root;
}
@@ -75,35 +137,24 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* This makes it safe to release the MMU lock and yield within the loop, but
* if exiting the loop early, the caller must drop the reference to the most
* recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
*/
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
- for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \
- typeof(*_root), link); \
- tdp_mmu_next_root_valid(_kvm, _root); \
- _root = tdp_mmu_next_root(_kvm, _root))
-
-#define for_each_tdp_mmu_root(_kvm, _root) \
- list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
-
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush);
-
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WARN_ON(root->root_count);
- WARN_ON(!root->tdp_mmu_page);
-
- list_del(&root->link);
-
- zap_gfn_range(kvm, root, 0, max_gfn, false, false);
-
- free_page((unsigned long)root->spt);
- kmem_cache_free(mmu_page_header_cache, root);
-}
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared)) \
+ if (kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
+
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
+ lockdep_is_held_type(&kvm->mmu_lock, 0) || \
+ lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
+ if (kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
int level)
@@ -137,81 +188,46 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
return sp;
}
-static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
union kvm_mmu_page_role role;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
- role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
+ lockdep_assert_held_write(&kvm->mmu_lock);
- write_lock(&kvm->mmu_lock);
+ role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
/* Check for an existing root before allocating a new one. */
- for_each_tdp_mmu_root(kvm, root) {
- if (root->role.word == role.word) {
- kvm_mmu_get_root(kvm, root);
- write_unlock(&kvm->mmu_lock);
- return root;
- }
+ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ if (root->role.word == role.word &&
+ kvm_tdp_mmu_get_root(kvm, root))
+ goto out;
}
root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
- root->root_count = 1;
-
- list_add(&root->link, &kvm->arch.tdp_mmu_roots);
-
- write_unlock(&kvm->mmu_lock);
-
- return root;
-}
-
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *root;
+ refcount_set(&root->tdp_mmu_root_count, 1);
- root = get_tdp_mmu_vcpu_root(vcpu);
- if (!root)
- return INVALID_PAGE;
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out:
return __pa(root->spt);
}
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
- free_page((unsigned long)sp->spt);
- kmem_cache_free(mmu_page_header_cache, sp);
-}
-
-/*
- * This is called through call_rcu in order to free TDP page table memory
- * safely with respect to other kernel threads that may be operating on
- * the memory.
- * By only accessing TDP MMU page table memory in an RCU read critical
- * section, and freeing it after a grace period, lockless access to that
- * memory won't use it after it is freed.
- */
-static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
-{
- struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
- rcu_head);
-
- tdp_mmu_free_sp(sp);
-}
-
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
- bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
return;
if (is_accessed_spte(old_spte) &&
- (!is_accessed_spte(new_spte) || pfn_changed))
+ (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
+ spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
@@ -455,7 +471,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (was_leaf && is_dirty_spte(old_spte) &&
- (!is_dirty_spte(new_spte) || pfn_changed))
+ (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
/*
@@ -479,8 +495,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
}
/*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
- * associated bookkeeping
+ * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping, but do not mark the page dirty
+ * in KVM's dirty bitmaps.
*
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
@@ -488,9 +505,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* Returns: true if the SPTE was set, false if it was not. If false is returned,
* this function will have no side-effects.
*/
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
lockdep_assert_held_read(&kvm->mmu_lock);
@@ -498,19 +515,32 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
* Do not change removed SPTEs. Only the thread that froze the SPTE
* may modify it.
*/
- if (iter->old_spte == REMOVED_SPTE)
+ if (is_removed_spte(iter->old_spte))
return false;
if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
new_spte) != iter->old_spte)
return false;
- handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
+ __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+ handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
return true;
}
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
+ return false;
+
+ handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+ return true;
+}
+
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -569,7 +599,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
*/
- WARN_ON(iter->old_spte == REMOVED_SPTE);
+ WARN_ON(is_removed_spte(iter->old_spte));
WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
@@ -634,7 +664,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* Return false if a yield was not needed.
*/
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
- struct tdp_iter *iter, bool flush)
+ struct tdp_iter *iter, bool flush,
+ bool shared)
{
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -646,7 +677,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
if (flush)
kvm_flush_remote_tlbs(kvm);
- cond_resched_rwlock_write(&kvm->mmu_lock);
+ if (shared)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
rcu_read_lock();
WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -664,24 +699,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
+ *
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup. Note, in some use cases a flush may be
- * required by prior actions. Ensure the pending flush is performed prior to
- * yielding.
+ * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
*/
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush)
+ gfn_t start, gfn_t end, bool can_yield, bool flush,
+ bool shared)
{
struct tdp_iter iter;
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
+retry:
if (can_yield &&
- tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
flush = false;
continue;
}
@@ -699,8 +742,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
- flush = true;
+ if (!shared) {
+ tdp_mmu_set_spte(kvm, &iter, 0);
+ flush = true;
+ } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
}
rcu_read_unlock();
@@ -712,15 +764,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU in write mode.
*/
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
- bool can_yield)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+ gfn_t end, bool can_yield, bool flush,
+ bool shared)
{
struct kvm_mmu_page *root;
- bool flush = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root)
- flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+ flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
+ shared);
return flush;
}
@@ -728,14 +786,116 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- bool flush;
+ bool flush = false;
+ int i;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+ flush, false);
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root)
+{
+ struct kvm_mmu_page *next_root;
+
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root && !(next_root->role.invalid &&
+ refcount_read(&next_root->tdp_mmu_root_count)))
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link,
+ typeof(*next_root), link);
+
+ return next_root;
+}
+
+/*
+ * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
+ * invalidated root, they will not be freed until this function drops the
+ * reference. Before dropping that reference, tear down the paging
+ * structure so that whichever thread does drop the last reference
+ * only has to do a trivial amount of work. Since the roots are invalid,
+ * no new SPTEs should be created under them.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+{
+ gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ struct kvm_mmu_page *next_root;
+ struct kvm_mmu_page *root;
+ bool flush = false;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ root = next_invalidated_root(kvm, NULL);
+
+ while (root) {
+ next_root = next_invalidated_root(kvm, root);
+
+ rcu_read_unlock();
+
+ flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
+ true);
+
+ /*
+ * Put the reference acquired in
+ * kvm_tdp_mmu_invalidate_roots
+ */
+ kvm_tdp_mmu_put_root(kvm, root, true);
+
+ root = next_root;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
if (flush)
kvm_flush_remote_tlbs(kvm);
}
/*
+ * Mark each TDP MMU root as invalid so that other threads
+ * will drop their references and allow the root count to
+ * go to 0.
+ *
+ * Also take a reference on all roots so that this thread
+ * can do the bulk of the work required to free the roots
+ * once they are invalidated. Without this reference, a
+ * vCPU thread might drop the last reference to a root and
+ * get stuck with tearing down the entire paging structure.
+ *
+ * Roots which have a zero refcount should be skipped as
+ * they're already being torn down.
+ * Already invalid roots should be referenced again so that
+ * they aren't freed before kvm_tdp_mmu_zap_all_fast is
+ * done with them.
+ *
+ * This has essentially the same effect for the TDP MMU
+ * as updating mmu_valid_gen does for the shadow MMU.
+ */
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
+ if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+ root->role.invalid = true;
+}
+
+/*
* Installs a last-level SPTE to handle a TDP page fault.
* (NPT/EPT violation/misconfiguration)
*/
@@ -777,12 +937,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
ret = RET_PF_EMULATE;
- } else
+ } else {
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
+ }
- trace_kvm_mmu_set_spte(iter->level, iter->gfn,
- rcu_dereference(iter->sptep));
if (!prefault)
vcpu->stat.pf_fixed++;
@@ -882,199 +1041,139 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
return ret;
}
-static __always_inline int
-kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root,
- gfn_t start,
- gfn_t end,
- unsigned long data))
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
struct kvm_mmu_page *root;
- int ret = 0;
- int as_id;
-
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- as_id = kvm_mmu_page_as_id(root);
- slots = __kvm_memslots(kvm, as_id);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- ret |= handler(kvm, memslot, root, gfn_start,
- gfn_end, data);
- }
- }
+ for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
+ flush |= zap_gfn_range(kvm, root, range->start, range->end,
+ range->may_block, flush, false);
- return ret;
+ return flush;
}
-static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start,
- gfn_t end, unsigned long unused)
-{
- return zap_gfn_range(kvm, root, start, end, false, false);
-}
+typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range);
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ tdp_handler_t handler)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
- zap_gfn_range_hva_wrapper);
+ struct kvm_mmu_page *root;
+ struct tdp_iter iter;
+ bool ret = false;
+
+ rcu_read_lock();
+
+ /*
+ * Don't support rescheduling, none of the MMU notifiers that funnel
+ * into this helper allow blocking; it'd be dead, wasteful code.
+ */
+ for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+ tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+ ret |= handler(kvm, &iter, range);
+ }
+
+ rcu_read_unlock();
+
+ return ret;
}
/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
*/
-static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start, gfn_t end,
- unsigned long unused)
+static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
- int young = 0;
u64 new_spte = 0;
- rcu_read_lock();
+ /* If we have a non-accessed entry we don't need to change the pte. */
+ if (!is_accessed_spte(iter->old_spte))
+ return false;
- tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ new_spte = iter->old_spte;
+
+ if (spte_ad_enabled(new_spte)) {
+ new_spte &= ~shadow_accessed_mask;
+ } else {
/*
- * If we have a non-accessed entry we don't need to change the
- * pte.
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
*/
- if (!is_accessed_spte(iter.old_spte))
- continue;
-
- new_spte = iter.old_spte;
-
- if (spte_ad_enabled(new_spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)&new_spte);
- } else {
- /*
- * Capture the dirty status of the page, so that it doesn't get
- * lost when the SPTE is marked for access tracking.
- */
- if (is_writable_pte(new_spte))
- kvm_set_pfn_dirty(spte_to_pfn(new_spte));
+ if (is_writable_pte(new_spte))
+ kvm_set_pfn_dirty(spte_to_pfn(new_spte));
- new_spte = mark_spte_for_access_track(new_spte);
- }
- new_spte &= ~shadow_dirty_mask;
-
- tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
- young = 1;
-
- trace_kvm_age_page(iter.gfn, iter.level, slot, young);
+ new_spte = mark_spte_for_access_track(new_spte);
}
- rcu_read_unlock();
+ tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
- return young;
+ return true;
}
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end)
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
- age_gfn_range);
+ return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}
-static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
- unsigned long unused2)
+static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
-
- tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
- if (is_accessed_spte(iter.old_spte))
- return 1;
-
- return 0;
+ return is_accessed_spte(iter->old_spte);
}
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
- test_age_gfn);
+ return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
- unsigned long data)
+static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
- pte_t *ptep = (pte_t *)data;
- kvm_pfn_t new_pfn;
u64 new_spte;
- int need_flush = 0;
-
- rcu_read_lock();
- WARN_ON(pte_huge(*ptep));
+ /* Huge pages aren't expected to be modified without first being zapped. */
+ WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
- new_pfn = pte_pfn(*ptep);
-
- tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
- if (iter.level != PG_LEVEL_4K)
- continue;
-
- if (!is_shadow_present_pte(iter.old_spte))
- break;
-
- tdp_mmu_set_spte(kvm, &iter, 0);
-
- kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
+ if (iter->level != PG_LEVEL_4K ||
+ !is_shadow_present_pte(iter->old_spte))
+ return false;
- if (!pte_write(*ptep)) {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(
- iter.old_spte, new_pfn);
+ /*
+ * Note, when changing a read-only SPTE, it's not strictly necessary to
+ * zero the SPTE before setting the new PFN, but doing so preserves the
+ * invariant that the PFN of a present * leaf SPTE can never change.
+ * See __handle_changed_spte().
+ */
+ tdp_mmu_set_spte(kvm, iter, 0);
- tdp_mmu_set_spte(kvm, &iter, new_spte);
- }
+ if (!pte_write(range->pte)) {
+ new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
+ pte_pfn(range->pte));
- need_flush = 1;
+ tdp_mmu_set_spte(kvm, iter, new_spte);
}
- if (need_flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
-
- rcu_read_unlock();
-
- return 0;
+ return true;
}
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
- pte_t *host_ptep)
+/*
+ * Handle the changed_pte MMU notifier for the TDP MMU.
+ * data is a pointer to the new pte_t mapping the HVA specified by the MMU
+ * notifier.
+ * Returns non-zero if a flush is needed before releasing the MMU lock.
+ */
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
- (unsigned long)host_ptep,
- set_tdp_spte);
+ bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
+
+ /* FIXME: return 'flush' instead of flushing here. */
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
+
+ return false;
}
/*
@@ -1095,7 +1194,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1105,7 +1205,15 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+ new_spte)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
spte_set = true;
}
@@ -1122,17 +1230,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
int min_level)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
- }
return spte_set;
}
@@ -1154,7 +1258,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1169,7 +1274,15 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
}
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+ new_spte)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
spte_set = true;
}
@@ -1187,17 +1300,13 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
- }
return spte_set;
}
@@ -1259,37 +1368,32 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
bool wrprot)
{
struct kvm_mmu_page *root;
- int root_as_id;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
-
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
- }
}
/*
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
-static void zap_collapsible_spte_range(struct kvm *kvm,
+static bool zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot,
+ bool flush)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
kvm_pfn_t pfn;
- bool spte_set = false;
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
- spte_set = false;
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
+ flush = false;
continue;
}
@@ -1303,38 +1407,43 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
pfn, PG_LEVEL_NUM))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
-
- spte_set = true;
+ if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
+ flush = true;
}
rcu_read_unlock();
- if (spte_set)
- kvm_flush_remote_tlbs(kvm);
+
+ return flush;
}
/*
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
*/
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ bool flush)
{
struct kvm_mmu_page *root;
- int root_as_id;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
- zap_collapsible_spte_range(kvm, root, slot);
- }
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ flush = zap_collapsible_spte_range(kvm, root, slot, flush);
+
+ return flush;
}
/*
* Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
*/
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -1351,7 +1460,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
break;
new_spte = iter.old_spte &
- ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
@@ -1364,24 +1473,19 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
/*
* Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
*/
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
-
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
spte_set |= write_protect_gfn(kvm, root, gfn);
- }
+
return spte_set;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 31096ece9b14..5fdf63090451 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -6,14 +6,28 @@
#include <linux/kvm_host.h>
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
- bool can_yield);
-static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
- gfn_t end)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
+ struct kvm_mmu_page *root)
{
- return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
+ if (root->role.invalid)
+ return false;
+
+ return refcount_inc_not_zero(&root->tdp_mmu_root_count);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared);
+
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+ gfn_t end, bool can_yield, bool flush,
+ bool shared);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
+ gfn_t start, gfn_t end, bool flush,
+ bool shared)
+{
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
+ shared);
}
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
@@ -29,23 +43,23 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
* of the shadow page's gfn range and stop iterating before yielding.
*/
lockdep_assert_held_write(&kvm->mmu_lock);
- return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
+ sp->gfn, end, false, false, false);
}
+
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int map_writable, int max_level, kvm_pfn_t pfn,
bool prefault);
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end);
-
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end);
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva);
-
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
- pte_t *host_ptep);
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
int min_level);
@@ -55,8 +69,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *slot);
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ bool flush);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 78bdcfac4e40..cd0285f15a68 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -270,7 +270,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
return -EINVAL;
- if (!svm->vcpu.arch.apic->regs)
+ if (!vcpu->arch.apic->regs)
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
@@ -281,7 +281,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return ret;
}
- svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+ svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
/* Setting AVIC backing page address in the phy APIC ID table */
entry = avic_get_physical_id_entry(vcpu, id);
@@ -315,15 +315,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
}
}
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
u32 icrl = svm->vmcb->control.exit_info_1;
u32 id = svm->vmcb->control.exit_info_2 >> 32;
u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+ trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
switch (id) {
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
@@ -347,11 +348,11 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
* set the appropriate IRR bits on the valid target
* vcpus. So, we just need to kick the appropriate vcpu.
*/
- avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
+ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
break;
case AVIC_IPI_FAILURE_INVALID_TARGET:
WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
- index, svm->vcpu.vcpu_id, icrh, icrl);
+ index, vcpu->vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
@@ -539,8 +540,9 @@ static bool is_avic_unaccelerated_access_trap(u32 offset)
return ret;
}
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret = 0;
u32 offset = svm->vmcb->control.exit_info_1 &
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
@@ -550,7 +552,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
AVIC_UNACCEL_ACCESS_WRITE_MASK;
bool trap = is_avic_unaccelerated_access_trap(offset);
- trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+ trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
trap, write, vector);
if (trap) {
/* Handling Trap */
@@ -558,7 +560,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
ret = avic_unaccel_trap_write(svm);
} else {
/* Handling Fault */
- ret = kvm_emulate_instruction(&svm->vcpu, 0);
+ ret = kvm_emulate_instruction(vcpu, 0);
}
return ret;
@@ -572,7 +574,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
if (!avic || !irqchip_in_kernel(vcpu->kvm))
return 0;
- ret = avic_init_backing_page(&svm->vcpu);
+ ret = avic_init_backing_page(vcpu);
if (ret)
return ret;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index fb204eaa8bb3..540d43ba2cf4 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -29,6 +29,8 @@
#include "lapic.h"
#include "svm.h"
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
@@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
WARN_ON(mmu_is_nested(vcpu));
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
- kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ svm->vmcb01.ptr->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
@@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
return;
c = &svm->vmcb->control;
- h = &svm->nested.hsave->control;
+ h = &svm->vmcb01.ptr->control;
g = &svm->nested.ctl;
for (i = 0; i < MAX_INTERCEPT; i++)
@@ -213,44 +215,64 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
return true;
}
-static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+/*
+ * Bits 11:0 of bitmap address are ignored by hardware
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
{
- struct vcpu_svm *svm = to_svm(vcpu);
+ u64 addr = PAGE_ALIGN(pa);
- if (WARN_ON(!is_guest_mode(vcpu)))
- return true;
-
- if (!nested_svm_vmrun_msrpm(svm)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return false;
- }
-
- return true;
+ return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+ kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
}
-static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_control_area *control)
{
- if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
+ if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
return false;
- if (control->asid == 0)
+ if (CC(control->asid == 0))
return false;
- if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
- !npt_enabled)
+ if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+ return false;
+
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+ MSRPM_SIZE)))
+ return false;
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+ IOPM_SIZE)))
return false;
return true;
}
-static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area *save)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- bool vmcb12_lma;
+ /*
+ * These checks are also performed by KVM_SET_SREGS,
+ * except that EFER.LMA is not checked by SVM against
+ * CR0.PG && EFER.LME.
+ */
+ if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+ if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+ CC(!(save->cr0 & X86_CR0_PE)) ||
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+ return false;
+ }
+
+ if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
+ return false;
+
+ return true;
+}
+/* Common checks that apply to both L1 and L2 state. */
+static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area *save)
+{
/*
* FIXME: these should be done after copying the fields,
* to avoid TOC/TOU races. For these save area checks
@@ -258,31 +280,27 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
* kvm_set_cr4 handle failure; EFER_SVME is an exception
* so it is force-set later in nested_prepare_vmcb_save.
*/
- if ((vmcb12->save.efer & EFER_SVME) == 0)
+ if (CC(!(save->efer & EFER_SVME)))
return false;
- if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
return false;
- if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
return false;
- vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
+ if (!nested_vmcb_check_cr3_cr4(vcpu, save))
+ return false;
- if (vmcb12_lma) {
- if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
- !(vmcb12->save.cr0 & X86_CR0_PE) ||
- kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
- return false;
- }
- if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
return false;
return true;
}
-static void load_nested_vmcb_control(struct vcpu_svm *svm,
- struct vmcb_control_area *control)
+static void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
{
copy_vmcb_control_area(&svm->nested.ctl, control);
@@ -294,9 +312,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
/*
* Synchronize fields that are written by the processor, so that
- * they can be copied back into the nested_vmcb.
+ * they can be copied back into the vmcb12.
*/
-void sync_nested_vmcb_control(struct vcpu_svm *svm)
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
{
u32 mask;
svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
@@ -324,8 +342,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
* Transfer any event that L0 or L1 wanted to inject into L2 to
* EXIT_INT_INFO.
*/
-static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
- struct vmcb *vmcb12)
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+ struct vmcb *vmcb12)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 exit_int_info = 0;
@@ -369,12 +387,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
bool nested_npt)
{
- if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
return -EINVAL;
if (!nested_npt && is_pae_paging(vcpu) &&
(cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
- if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
return -EINVAL;
}
@@ -393,15 +411,42 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
return 0;
}
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
{
+ if (!svm->nested.vmcb02.ptr)
+ return;
+
+ /* FIXME: merge g_pat from vmcb01 and vmcb12. */
+ svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+ bool new_vmcb12 = false;
+
+ nested_vmcb02_compute_g_pat(svm);
+
/* Load the nested guest state */
- svm->vmcb->save.es = vmcb12->save.es;
- svm->vmcb->save.cs = vmcb12->save.cs;
- svm->vmcb->save.ss = vmcb12->save.ss;
- svm->vmcb->save.ds = vmcb12->save.ds;
- svm->vmcb->save.gdtr = vmcb12->save.gdtr;
- svm->vmcb->save.idtr = vmcb12->save.idtr;
+ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+ new_vmcb12 = true;
+ svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+ svm->vmcb->save.es = vmcb12->save.es;
+ svm->vmcb->save.cs = vmcb12->save.cs;
+ svm->vmcb->save.ss = vmcb12->save.ss;
+ svm->vmcb->save.ds = vmcb12->save.ds;
+ svm->vmcb->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+ svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+ svm->vmcb->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+ }
+
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
/*
@@ -413,7 +458,9 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
- svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+ svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
@@ -422,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.rax = vmcb12->save.rax;
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
- svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
- svm->vmcb->save.cpl = vmcb12->save.cpl;
+
+ /* These bits will be set properly on the first execution when new_vmc12 is true */
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+ svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+ }
}
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+ /*
+ * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+ * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+ */
+
+ /*
+ * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
+ * avic_physical_id.
+ */
+ WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
+
+ /* Copied from vmcb01. msrpm_base can be overwritten later. */
+ svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
+ svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
+ svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+
+ /* Done at vmrun: asid. */
+
+ /* Also overwritten later if necessary. */
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* nested_cr3. */
if (nested_npt_enabled(svm))
nested_svm_init_mmu_context(&svm->vcpu);
@@ -439,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & ~mask) |
- (svm->nested.hsave->control.int_ctl & mask);
+ (svm->vmcb01.ptr->control.int_ctl & mask);
svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
@@ -454,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
enter_guest_mode(&svm->vcpu);
/*
- * Merge guest and host intercepts - must be called with vcpu in
- * guest-mode to take affect here
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take effect.
*/
recalc_intercepts(svm);
+}
- vmcb_mark_all_dirty(svm->vmcb);
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ /*
+ * Some VMCB state is shared between L1 and L2 and thus has to be
+ * moved at the time of nested vmrun and vmexit.
+ *
+ * VMLOAD/VMSAVE state would also belong in this category, but KVM
+ * always performs VMLOAD and VMSAVE from the VMCB01.
+ */
+ to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
}
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
struct vmcb *vmcb12)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret;
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
@@ -482,8 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
svm->nested.vmcb12_gpa = vmcb12_gpa;
- nested_prepare_vmcb_control(svm);
- nested_prepare_vmcb_save(svm, vmcb12);
+
+ WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+ nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm);
+ nested_vmcb02_prepare_save(svm, vmcb12);
ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
nested_npt_enabled(svm));
@@ -491,47 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
return ret;
if (!npt_enabled)
- svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+ vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
svm_set_gif(svm, true);
return 0;
}
-int nested_svm_vmrun(struct vcpu_svm *svm)
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret;
struct vmcb *vmcb12;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb12_gpa;
- if (is_smm(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ ++vcpu->stat.nested_run;
+
+ if (is_smm(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
vmcb12_gpa = svm->vmcb->save.rax;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
if (ret == -EINVAL) {
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
} else if (ret) {
- return kvm_skip_emulated_instruction(&svm->vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ ret = kvm_skip_emulated_instruction(vcpu);
vmcb12 = map.hva;
if (WARN_ON_ONCE(!svm->nested.initialized))
return -EINVAL;
- load_nested_vmcb_control(svm, &vmcb12->control);
+ nested_load_control_from_vmcb12(svm, &vmcb12->control);
- if (!nested_vmcb_check_save(svm, vmcb12) ||
- !nested_vmcb_check_controls(&svm->nested.ctl)) {
+ if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
+ !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
vmcb12->control.exit_info_1 = 0;
@@ -541,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
/* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
/*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
+ * Since vmcb01 is not in use, we can use it to store some of the L1
+ * state.
*/
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(&hsave->control, &vmcb->control);
+ svm->vmcb01.ptr->save.efer = vcpu->arch.efer;
+ svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu);
+ svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4;
+ svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
+ svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu);
+
+ if (!npt_enabled)
+ svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
svm->nested.nested_run_pending = 1;
- if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12))
goto out_exit_err;
if (nested_svm_vmrun_msrpm(svm))
@@ -587,7 +667,7 @@ out_exit_err:
nested_svm_vmexit(svm);
out:
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
@@ -610,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
int nested_svm_vmexit(struct vcpu_svm *svm)
{
- int rc;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
struct vmcb *vmcb12;
- struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
+ int rc;
- rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+ /* Triple faults in L2 should never escape. */
+ WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
+ rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
if (rc) {
if (rc == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
vmcb12 = map.hva;
/* Exit Guest-Mode */
- leave_guest_mode(&svm->vcpu);
+ leave_guest_mode(vcpu);
svm->nested.vmcb12_gpa = 0;
WARN_ON_ONCE(svm->nested.nested_run_pending);
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
/* in case we halted in L2 */
svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -644,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->save.gdtr = vmcb->save.gdtr;
vmcb12->save.idtr = vmcb->save.idtr;
vmcb12->save.efer = svm->vcpu.arch.efer;
- vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu);
- vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ vmcb12->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(vcpu);
vmcb12->save.cr2 = vmcb->save.cr2;
vmcb12->save.cr4 = svm->vcpu.arch.cr4;
- vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
- vmcb12->save.rip = kvm_rip_read(&svm->vcpu);
- vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu);
- vmcb12->save.rax = kvm_rax_read(&svm->vcpu);
+ vmcb12->save.rflags = kvm_get_rflags(vcpu);
+ vmcb12->save.rip = kvm_rip_read(vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(vcpu);
+ vmcb12->save.rax = kvm_rax_read(vcpu);
vmcb12->save.dr7 = vmcb->save.dr7;
vmcb12->save.dr6 = svm->vcpu.arch.dr6;
vmcb12->save.cpl = vmcb->save.cpl;
@@ -663,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
- nested_vmcb_save_pending_event(svm, vmcb12);
+ nested_save_pending_event_to_vmcb12(svm, vmcb12);
if (svm->nrips_enabled)
vmcb12->control.next_rip = vmcb->control.next_rip;
@@ -678,37 +761,39 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.pause_filter_thresh =
svm->vmcb->control.pause_filter_thresh;
- /* Restore the original control entries */
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
+ nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+ WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
- /* On vmexit the GIF is set to false */
+ /*
+ * On vmexit the GIF is set to false and
+ * no event can be injected in L1.
+ */
svm_set_gif(svm, false);
+ svm->vmcb->control.exit_int_info = 0;
- svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
- svm->vcpu.arch.l1_tsc_offset;
+ svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+ if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ }
svm->nested.ctl.nested_cr3 = 0;
- /* Restore selected save entries */
- svm->vmcb->save.es = hsave->save.es;
- svm->vmcb->save.cs = hsave->save.cs;
- svm->vmcb->save.ss = hsave->save.ss;
- svm->vmcb->save.ds = hsave->save.ds;
- svm->vmcb->save.gdtr = hsave->save.gdtr;
- svm->vmcb->save.idtr = hsave->save.idtr;
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
- svm_set_efer(&svm->vcpu, hsave->save.efer);
- svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
- svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- kvm_rax_write(&svm->vcpu, hsave->save.rax);
- kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
- kvm_rip_write(&svm->vcpu, hsave->save.rip);
- svm->vmcb->save.dr7 = DR7_FIXED_1;
- svm->vmcb->save.cpl = 0;
- svm->vmcb->control.exit_int_info = 0;
+ /*
+ * Restore processor state that had been saved in vmcb01
+ */
+ kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
+ svm_set_efer(vcpu, svm->vmcb->save.efer);
+ svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, svm->vmcb->save.cr4);
+ kvm_rax_write(vcpu, svm->vmcb->save.rax);
+ kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
+ kvm_rip_write(vcpu, svm->vmcb->save.rip);
- vmcb_mark_all_dirty(svm->vmcb);
+ svm->vcpu.arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(&svm->vcpu);
trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
vmcb12->control.exit_info_1,
@@ -717,50 +802,62 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.exit_int_info_err,
KVM_ISA_SVM);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ kvm_vcpu_unmap(vcpu, &map, true);
- nested_svm_uninit_mmu_context(&svm->vcpu);
+ nested_svm_uninit_mmu_context(vcpu);
- rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+ rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false);
if (rc)
return 1;
- if (npt_enabled)
- svm->vmcb->save.cr3 = hsave->save.cr3;
-
/*
* Drop what we picked up for L2 via svm_complete_interrupts() so it
* doesn't end up in L1.
*/
svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ /*
+ * If we are here following the completion of a VMRUN that
+ * is being single-stepped, queue the pending #DB intercept
+ * right now so that it an be accounted for before we execute
+ * L1's next instruction.
+ */
+ if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+ kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
return 0;
}
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+ nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
int svm_allocate_nested(struct vcpu_svm *svm)
{
- struct page *hsave_page;
+ struct page *vmcb02_page;
if (svm->nested.initialized)
return 0;
- hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!hsave_page)
+ vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb02_page)
return -ENOMEM;
- svm->nested.hsave = page_address(hsave_page);
+ svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+ svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
if (!svm->nested.msrpm)
- goto err_free_hsave;
+ goto err_free_vmcb02;
svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
svm->nested.initialized = true;
return 0;
-err_free_hsave:
- __free_page(hsave_page);
+err_free_vmcb02:
+ __free_page(vmcb02_page);
return -ENOMEM;
}
@@ -772,8 +869,8 @@ void svm_free_nested(struct vcpu_svm *svm)
svm_vcpu_free_msrpm(svm->nested.msrpm);
svm->nested.msrpm = NULL;
- __free_page(virt_to_page(svm->nested.hsave));
- svm->nested.hsave = NULL;
+ __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+ svm->nested.vmcb02.ptr = NULL;
svm->nested.initialized = false;
}
@@ -783,18 +880,19 @@ void svm_free_nested(struct vcpu_svm *svm)
*/
void svm_leave_nested(struct vcpu_svm *svm)
{
- if (is_guest_mode(&svm->vcpu)) {
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0;
- leave_guest_mode(&svm->vcpu);
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
- nested_svm_uninit_mmu_context(&svm->vcpu);
+ leave_guest_mode(vcpu);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+ nested_svm_uninit_mmu_context(vcpu);
vmcb_mark_all_dirty(svm->vmcb);
}
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -903,16 +1001,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm)
return vmexit;
}
-int nested_svm_check_permissions(struct vcpu_svm *svm)
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
{
- if (!(svm->vcpu.arch.efer & EFER_SVME) ||
- !is_paging(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (svm->vmcb->save.cpl) {
- kvm_inject_gp(&svm->vcpu, 0);
+ if (to_svm(vcpu)->vmcb->save.cpl) {
+ kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -960,50 +1057,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
nested_svm_vmexit(svm);
}
-static void nested_svm_smi(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_SMI;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-static void nested_svm_nmi(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_NMI;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
- trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}
-static void nested_svm_init(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_INIT;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1017,12 +1075,18 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_init(svm))
return 0;
- nested_svm_init(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
return 0;
}
if (vcpu->arch.exception.pending) {
- if (block_nested_events)
+ /*
+ * Only a pending nested run can block a pending exception.
+ * Otherwise an injected NMI/interrupt should either be
+ * lost or delivered to the nested hypervisor in the EXITINTINFO
+ * vmcb field, while delivering the pending exception.
+ */
+ if (svm->nested.nested_run_pending)
return -EBUSY;
if (!nested_exit_on_exception(svm))
return 0;
@@ -1035,7 +1099,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_smi(svm))
return 0;
- nested_svm_smi(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
return 0;
}
@@ -1044,7 +1108,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_nmi(svm))
return 0;
- nested_svm_nmi(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
return 0;
}
@@ -1053,7 +1117,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_intr(svm))
return 0;
- nested_svm_intr(svm);
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
return 0;
}
@@ -1072,8 +1137,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
- excp_bits)
+ if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
return NESTED_EXIT_HOST;
else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
svm->vcpu.arch.apf.host_apf_flags)
@@ -1137,10 +1202,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
sizeof(user_vmcb->control)))
return -EFAULT;
- if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+ if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
sizeof(user_vmcb->save)))
return -EFAULT;
-
out:
return kvm_state.size;
}
@@ -1150,7 +1214,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state *kvm_state)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
struct vmcb __user *user_vmcb = (struct vmcb __user *)
&user_kvm_nested_state->data.svm[0];
struct vmcb_control_area *ctl;
@@ -1195,8 +1258,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
ret = -ENOMEM;
- ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
- save = kzalloc(sizeof(*save), GFP_KERNEL);
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT);
+ save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
if (!ctl || !save)
goto out_free;
@@ -1207,12 +1270,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
goto out_free;
ret = -EINVAL;
- if (!nested_vmcb_check_controls(ctl))
+ if (!nested_vmcb_check_controls(vcpu, ctl))
goto out_free;
/*
* Processor state contains L2 state. Check that it is
- * valid for guest mode (see nested_vmcb_checks).
+ * valid for guest mode (see nested_vmcb_check_save).
*/
cr0 = kvm_read_cr0(vcpu);
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
@@ -1221,29 +1284,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
/*
* Validate host state saved from before VMRUN (see
* nested_svm_check_permissions).
- * TODO: validate reserved bits for all saved state.
*/
- if (!(save->cr0 & X86_CR0_PG))
- goto out_free;
- if (!(save->efer & EFER_SVME))
+ if (!(save->cr0 & X86_CR0_PG) ||
+ !(save->cr0 & X86_CR0_PE) ||
+ (save->rflags & X86_EFLAGS_VM) ||
+ !nested_vmcb_valid_sregs(vcpu, save))
goto out_free;
/*
- * All checks done, we can enter guest mode. L1 control fields
- * come from the nested save state. Guest state is already
- * in the registers, the save area of the nested state instead
- * contains saved L1 state.
+ * All checks done, we can enter guest mode. Userspace provides
+ * vmcb12.control, which will be combined with L1 and stored into
+ * vmcb02, and the L1 save state which we store in vmcb01.
+ * L2 registers if needed are moved from the current VMCB to VMCB02.
*/
svm->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
- copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
- hsave->save = *save;
-
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
- load_nested_vmcb_control(svm, ctl);
- nested_prepare_vmcb_control(svm);
+ if (svm->current_vmcb == &svm->vmcb01)
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm->vmcb01.ptr->save.es = save->es;
+ svm->vmcb01.ptr->save.cs = save->cs;
+ svm->vmcb01.ptr->save.ss = save->ss;
+ svm->vmcb01.ptr->save.ds = save->ds;
+ svm->vmcb01.ptr->save.gdtr = save->gdtr;
+ svm->vmcb01.ptr->save.idtr = save->idtr;
+ svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED;
+ svm->vmcb01.ptr->save.efer = save->efer;
+ svm->vmcb01.ptr->save.cr0 = save->cr0;
+ svm->vmcb01.ptr->save.cr3 = save->cr3;
+ svm->vmcb01.ptr->save.cr4 = save->cr4;
+ svm->vmcb01.ptr->save.rax = save->rax;
+ svm->vmcb01.ptr->save.rsp = save->rsp;
+ svm->vmcb01.ptr->save.rip = save->rip;
+ svm->vmcb01.ptr->save.cpl = 0;
+
+ nested_load_control_from_vmcb12(svm, ctl);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+ nested_vmcb02_prepare_control(svm);
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
ret = 0;
@@ -1254,8 +1336,31 @@ out_free:
return ret;
}
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON(!is_guest_mode(vcpu)))
+ return true;
+
+ if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm)))
+ return false;
+
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ return true;
+}
+
struct kvm_x86_nested_ops svm_nested_ops = {
.check_events = svm_check_nested_events,
+ .triple_fault = nested_svm_triple_fault,
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 214eefb20414..2632852be856 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -50,6 +50,7 @@ static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
@@ -82,6 +83,11 @@ static int sev_flush_asids(void)
return ret;
}
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+ return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
/* Must be called with the sev_bitmap_lock held */
static bool __sev_recycle_asids(int min_asid, int max_asid)
{
@@ -184,49 +190,41 @@ static void sev_asid_free(struct kvm_sev_info *sev)
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
- struct sev_data_decommission *decommission;
- struct sev_data_deactivate *data;
+ struct sev_data_decommission decommission;
+ struct sev_data_deactivate deactivate;
if (!handle)
return;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return;
-
- /* deactivate handle */
- data->handle = handle;
+ deactivate.handle = handle;
/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
down_read(&sev_deactivate_lock);
- sev_guest_deactivate(data, NULL);
+ sev_guest_deactivate(&deactivate, NULL);
up_read(&sev_deactivate_lock);
- kfree(data);
-
- decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
- if (!decommission)
- return;
-
/* decommission handle */
- decommission->handle = handle;
- sev_guest_decommission(decommission, NULL);
-
- kfree(decommission);
+ decommission.handle = handle;
+ sev_guest_decommission(&decommission, NULL);
}
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ bool es_active = argp->id == KVM_SEV_ES_INIT;
int asid, ret;
+ if (kvm->created_vcpus)
+ return -EINVAL;
+
ret = -EBUSY;
if (unlikely(sev->active))
return ret;
+ sev->es_active = es_active;
asid = sev_asid_new(sev);
if (asid < 0)
- return ret;
+ goto e_no_asid;
sev->asid = asid;
ret = sev_platform_init(&argp->error);
@@ -234,6 +232,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free;
sev->active = true;
+ sev->asid = asid;
INIT_LIST_HEAD(&sev->regions_list);
return 0;
@@ -241,34 +240,21 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
e_free:
sev_asid_free(sev);
sev->asid = 0;
+e_no_asid:
+ sev->es_active = false;
return ret;
}
-static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- if (!sev_es)
- return -ENOTTY;
-
- to_kvm_svm(kvm)->sev_info.es_active = true;
-
- return sev_guest_init(kvm, argp);
-}
-
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
- struct sev_data_activate *data;
+ struct sev_data_activate activate;
int asid = sev_get_asid(kvm);
int ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
/* activate ASID on the given handle */
- data->handle = handle;
- data->asid = asid;
- ret = sev_guest_activate(data, error);
- kfree(data);
+ activate.handle = handle;
+ activate.asid = asid;
+ ret = sev_guest_activate(&activate, error);
return ret;
}
@@ -298,7 +284,7 @@ static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_start *start;
+ struct sev_data_launch_start start;
struct kvm_sev_launch_start params;
void *dh_blob, *session_blob;
int *error = &argp->error;
@@ -310,20 +296,16 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
- if (!start)
- return -ENOMEM;
+ memset(&start, 0, sizeof(start));
dh_blob = NULL;
if (params.dh_uaddr) {
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
- if (IS_ERR(dh_blob)) {
- ret = PTR_ERR(dh_blob);
- goto e_free;
- }
+ if (IS_ERR(dh_blob))
+ return PTR_ERR(dh_blob);
- start->dh_cert_address = __sme_set(__pa(dh_blob));
- start->dh_cert_len = params.dh_len;
+ start.dh_cert_address = __sme_set(__pa(dh_blob));
+ start.dh_cert_len = params.dh_len;
}
session_blob = NULL;
@@ -334,40 +316,38 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free_dh;
}
- start->session_address = __sme_set(__pa(session_blob));
- start->session_len = params.session_len;
+ start.session_address = __sme_set(__pa(session_blob));
+ start.session_len = params.session_len;
}
- start->handle = params.handle;
- start->policy = params.policy;
+ start.handle = params.handle;
+ start.policy = params.policy;
/* create memory encryption context */
- ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
if (ret)
goto e_free_session;
/* Bind ASID to this guest */
- ret = sev_bind_asid(kvm, start->handle, error);
+ ret = sev_bind_asid(kvm, start.handle, error);
if (ret)
goto e_free_session;
/* return handle to userspace */
- params.handle = start->handle;
+ params.handle = start.handle;
if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
- sev_unbind_asid(kvm, start->handle);
+ sev_unbind_asid(kvm, start.handle);
ret = -EFAULT;
goto e_free_session;
}
- sev->handle = start->handle;
+ sev->handle = start.handle;
sev->fd = argp->sev_fd;
e_free_session:
kfree(session_blob);
e_free_dh:
kfree(dh_blob);
-e_free:
- kfree(start);
return ret;
}
@@ -486,7 +466,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
- struct sev_data_launch_update_data *data;
+ struct sev_data_launch_update_data data;
struct page **inpages;
int ret;
@@ -496,20 +476,14 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
vaddr = params.uaddr;
size = params.len;
vaddr_end = vaddr + size;
/* Lock the user memory. */
inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
- if (IS_ERR(inpages)) {
- ret = PTR_ERR(inpages);
- goto e_free;
- }
+ if (IS_ERR(inpages))
+ return PTR_ERR(inpages);
/*
* Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
@@ -517,6 +491,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
*/
sev_clflush_pages(inpages, npages);
+ data.reserved = 0;
+ data.handle = sev->handle;
+
for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
int offset, len;
@@ -531,10 +508,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
- data->handle = sev->handle;
- data->len = len;
- data->address = __sme_page_pa(inpages[i]) + offset;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+ data.len = len;
+ data.address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
if (ret)
goto e_unpin;
@@ -550,8 +526,6 @@ e_unpin:
}
/* unlock the user pages */
sev_unpin_memory(kvm, inpages, npages);
-e_free:
- kfree(data);
return ret;
}
@@ -603,23 +577,22 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_update_vmsa *vmsa;
+ struct sev_data_launch_update_vmsa vmsa;
+ struct kvm_vcpu *vcpu;
int i, ret;
if (!sev_es_guest(kvm))
return -ENOTTY;
- vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL);
- if (!vmsa)
- return -ENOMEM;
+ vmsa.reserved = 0;
- for (i = 0; i < kvm->created_vcpus; i++) {
- struct vcpu_svm *svm = to_svm(kvm->vcpus[i]);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vcpu_svm *svm = to_svm(vcpu);
/* Perform some pre-encryption checks against the VMSA */
ret = sev_es_sync_vmsa(svm);
if (ret)
- goto e_free;
+ return ret;
/*
* The LAUNCH_UPDATE_VMSA command will perform in-place
@@ -629,27 +602,25 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
*/
clflush_cache_range(svm->vmsa, PAGE_SIZE);
- vmsa->handle = sev->handle;
- vmsa->address = __sme_pa(svm->vmsa);
- vmsa->len = PAGE_SIZE;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa,
+ vmsa.handle = sev->handle;
+ vmsa.address = __sme_pa(svm->vmsa);
+ vmsa.len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa,
&argp->error);
if (ret)
- goto e_free;
+ return ret;
svm->vcpu.arch.guest_state_protected = true;
}
-e_free:
- kfree(vmsa);
- return ret;
+ return 0;
}
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = (void __user *)(uintptr_t)argp->data;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_measure *data;
+ struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
void __user *p = NULL;
void *blob = NULL;
@@ -661,9 +632,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, measure, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
@@ -671,23 +640,20 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
p = (void __user *)(uintptr_t)params.uaddr;
if (p) {
- if (params.len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EINVAL;
- goto e_free;
- }
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
- ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
+ blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
- goto e_free;
+ return -ENOMEM;
- data->address = __psp_pa(blob);
- data->len = params.len;
+ data.address = __psp_pa(blob);
+ data.len = params.len;
}
cmd:
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
@@ -704,63 +670,50 @@ cmd:
}
done:
- params.len = data->len;
+ params.len = data.len;
if (copy_to_user(measure, &params, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
return ret;
}
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_finish *data;
- int ret;
+ struct sev_data_launch_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
-
- kfree(data);
- return ret;
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
}
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_guest_status params;
- struct sev_data_guest_status *data;
+ struct sev_data_guest_status data;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
if (ret)
- goto e_free;
+ return ret;
- params.policy = data->policy;
- params.state = data->state;
- params.handle = data->handle;
+ params.policy = data.policy;
+ params.state = data.state;
+ params.handle = data.handle;
if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
ret = -EFAULT;
-e_free:
- kfree(data);
+
return ret;
}
@@ -769,23 +722,17 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
int *error, bool enc)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_dbg *data;
- int ret;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ struct sev_data_dbg data;
- data->handle = sev->handle;
- data->dst_addr = dst;
- data->src_addr = src;
- data->len = size;
+ data.reserved = 0;
+ data.handle = sev->handle;
+ data.dst_addr = dst;
+ data.src_addr = src;
+ data.len = size;
- ret = sev_issue_cmd(kvm,
- enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
- data, error);
- kfree(data);
- return ret;
+ return sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ &data, error);
}
static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
@@ -1005,7 +952,7 @@ err:
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_secret *data;
+ struct sev_data_launch_secret data;
struct kvm_sev_launch_secret params;
struct page **pages;
void *blob, *hdr;
@@ -1037,41 +984,36 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_unpin_memory;
}
- ret = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- goto e_unpin_memory;
+ memset(&data, 0, sizeof(data));
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- data->guest_address = __sme_page_pa(pages[0]) + offset;
- data->guest_len = params.guest_len;
+ data.guest_address = __sme_page_pa(pages[0]) + offset;
+ data.guest_len = params.guest_len;
blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
if (IS_ERR(blob)) {
ret = PTR_ERR(blob);
- goto e_free;
+ goto e_unpin_memory;
}
- data->trans_address = __psp_pa(blob);
- data->trans_len = params.trans_len;
+ data.trans_address = __psp_pa(blob);
+ data.trans_len = params.trans_len;
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
if (IS_ERR(hdr)) {
ret = PTR_ERR(hdr);
goto e_free_blob;
}
- data->hdr_address = __psp_pa(hdr);
- data->hdr_len = params.hdr_len;
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
kfree(hdr);
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
e_unpin_memory:
/* content of memory is updated, mark pages dirty */
for (i = 0; i < n; i++) {
@@ -1086,7 +1028,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *report = (void __user *)(uintptr_t)argp->data;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_attestation_report *data;
+ struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
void __user *p;
void *blob = NULL;
@@ -1098,9 +1040,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
@@ -1108,23 +1048,20 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
p = (void __user *)(uintptr_t)params.uaddr;
if (p) {
- if (params.len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EINVAL;
- goto e_free;
- }
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
- ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
+ blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
- goto e_free;
+ return -ENOMEM;
- data->address = __psp_pa(blob);
- data->len = params.len;
- memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
}
cmd:
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
*/
@@ -1140,16 +1077,411 @@ cmd:
}
done:
- params.len = data->len;
+ params.len = data.len;
if (copy_to_user(report, &params, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
return ret;
}
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_start *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ int ret;
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+ if (ret < 0)
+ return ret;
+
+ params->session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ struct kvm_sev_send_start params;
+ void *amd_certs, *session_data;
+ void *pdh_cert, *plat_certs;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_start)))
+ return -EFAULT;
+
+ /* if session_len is zero, userspace wants to query the session length */
+ if (!params.session_len)
+ return __sev_send_start_query_session_length(kvm, argp,
+ &params);
+
+ /* some sanity checks */
+ if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+ !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ /* allocate the memory to hold the session data blob */
+ session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+ if (!session_data)
+ return -ENOMEM;
+
+ /* copy the certificate blobs from userspace */
+ pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+ params.pdh_cert_len);
+ if (IS_ERR(pdh_cert)) {
+ ret = PTR_ERR(pdh_cert);
+ goto e_free_session;
+ }
+
+ plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+ params.plat_certs_len);
+ if (IS_ERR(plat_certs)) {
+ ret = PTR_ERR(plat_certs);
+ goto e_free_pdh;
+ }
+
+ amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+ params.amd_certs_len);
+ if (IS_ERR(amd_certs)) {
+ ret = PTR_ERR(amd_certs);
+ goto e_free_plat_cert;
+ }
+
+ /* populate the FW SEND_START field with system physical address */
+ memset(&data, 0, sizeof(data));
+ data.pdh_cert_address = __psp_pa(pdh_cert);
+ data.pdh_cert_len = params.pdh_cert_len;
+ data.plat_certs_address = __psp_pa(plat_certs);
+ data.plat_certs_len = params.plat_certs_len;
+ data.amd_certs_address = __psp_pa(amd_certs);
+ data.amd_certs_len = params.amd_certs_len;
+ data.session_address = __psp_pa(session_data);
+ data.session_len = params.session_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ session_data, params.session_len)) {
+ ret = -EFAULT;
+ goto e_free_amd_cert;
+ }
+
+ params.policy = data.policy;
+ params.session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+e_free_amd_cert:
+ kfree(amd_certs);
+e_free_plat_cert:
+ kfree(plat_certs);
+e_free_pdh:
+ kfree(pdh_cert);
+e_free_session:
+ kfree(session_data);
+ return ret;
+}
+
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_update_data *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ int ret;
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+ if (ret < 0)
+ return ret;
+
+ params->hdr_len = data.hdr_len;
+ params->trans_len = data.trans_len;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_update_data)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ struct kvm_sev_send_update_data params;
+ void *hdr, *trans_data;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_update_data)))
+ return -EFAULT;
+
+ /* userspace wants to query either header or trans length */
+ if (!params.trans_len || !params.hdr_len)
+ return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+ if (!params.trans_uaddr || !params.guest_uaddr ||
+ !params.guest_len || !params.hdr_uaddr)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if ((params.guest_len + offset > PAGE_SIZE))
+ return -EINVAL;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (!guest_page)
+ return -EFAULT;
+
+ /* allocate memory for header and transport buffer */
+ ret = -ENOMEM;
+ hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+ if (!hdr)
+ goto e_unpin;
+
+ trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+ if (!trans_data)
+ goto e_free_hdr;
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans_data);
+ data.trans_len = params.trans_len;
+
+ /* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ if (ret)
+ goto e_free_trans_data;
+
+ /* copy transport buffer to user space */
+ if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ trans_data, params.trans_len)) {
+ ret = -EFAULT;
+ goto e_free_trans_data;
+ }
+
+ /* Copy packet header to userspace. */
+ ret = copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ params.hdr_len);
+
+e_free_trans_data:
+ kfree(trans_data);
+e_free_hdr:
+ kfree(hdr);
+e_unpin:
+ sev_unpin_memory(kvm, guest_page, n);
+
+ return ret;
+}
+
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_cancel data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_start start;
+ struct kvm_sev_receive_start params;
+ int *error = &argp->error;
+ void *session_data;
+ void *pdh_data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* Get parameter from the userspace */
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_start)))
+ return -EFAULT;
+
+ /* some sanity checks */
+ if (!params.pdh_uaddr || !params.pdh_len ||
+ !params.session_uaddr || !params.session_len)
+ return -EINVAL;
+
+ pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+ if (IS_ERR(pdh_data))
+ return PTR_ERR(pdh_data);
+
+ session_data = psp_copy_user_blob(params.session_uaddr,
+ params.session_len);
+ if (IS_ERR(session_data)) {
+ ret = PTR_ERR(session_data);
+ goto e_free_pdh;
+ }
+
+ memset(&start, 0, sizeof(start));
+ start.handle = params.handle;
+ start.policy = params.policy;
+ start.pdh_cert_address = __psp_pa(pdh_data);
+ start.pdh_cert_len = params.pdh_len;
+ start.session_address = __psp_pa(session_data);
+ start.session_len = params.session_len;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+ error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret)
+ goto e_free_session;
+
+ params.handle = start.handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ &params, sizeof(struct kvm_sev_receive_start))) {
+ ret = -EFAULT;
+ sev_unbind_asid(kvm, start.handle);
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_data);
+e_free_pdh:
+ kfree(pdh_data);
+
+ return ret;
+}
+
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_receive_update_data params;
+ struct sev_data_receive_update_data data;
+ void *hdr = NULL, *trans = NULL;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -EINVAL;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_update_data)))
+ return -EFAULT;
+
+ if (!params.hdr_uaddr || !params.hdr_len ||
+ !params.guest_uaddr || !params.guest_len ||
+ !params.trans_uaddr || !params.trans_len)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if ((params.guest_len + offset > PAGE_SIZE))
+ return -EINVAL;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto e_free_hdr;
+ }
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans);
+ data.trans_len = params.trans_len;
+
+ /* Pin guest memory */
+ ret = -EFAULT;
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (!guest_page)
+ goto e_free_trans;
+
+ /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+ &argp->error);
+
+ sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+ kfree(trans);
+e_free_hdr:
+ kfree(hdr);
+
+ return ret;
+}
+
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1166,13 +1498,22 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
mutex_lock(&kvm->lock);
+ /* enc_context_owner handles all memory enc operations */
+ if (is_mirroring_enc_context(kvm)) {
+ r = -EINVAL;
+ goto out;
+ }
+
switch (sev_cmd.id) {
+ case KVM_SEV_ES_INIT:
+ if (!sev_es) {
+ r = -ENOTTY;
+ goto out;
+ }
+ fallthrough;
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
- case KVM_SEV_ES_INIT:
- r = sev_es_guest_init(kvm, &sev_cmd);
- break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
@@ -1203,6 +1544,27 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
case KVM_SEV_GET_ATTESTATION_REPORT:
r = sev_get_attestation_report(kvm, &sev_cmd);
break;
+ case KVM_SEV_SEND_START:
+ r = sev_send_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_UPDATE_DATA:
+ r = sev_send_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_FINISH:
+ r = sev_send_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_CANCEL:
+ r = sev_send_cancel(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_START:
+ r = sev_receive_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_UPDATE_DATA:
+ r = sev_receive_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_FINISH:
+ r = sev_receive_finish(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -1226,6 +1588,10 @@ int svm_register_enc_region(struct kvm *kvm,
if (!sev_guest(kvm))
return -ENOTTY;
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;
@@ -1292,6 +1658,10 @@ int svm_unregister_enc_region(struct kvm *kvm,
struct enc_region *region;
int ret;
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
mutex_lock(&kvm->lock);
if (!sev_guest(kvm)) {
@@ -1322,6 +1692,71 @@ failed:
return ret;
}
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct file *source_kvm_file;
+ struct kvm *source_kvm;
+ struct kvm_sev_info *mirror_sev;
+ unsigned int asid;
+ int ret;
+
+ source_kvm_file = fget(source_fd);
+ if (!file_is_kvm(source_kvm_file)) {
+ ret = -EBADF;
+ goto e_source_put;
+ }
+
+ source_kvm = source_kvm_file->private_data;
+ mutex_lock(&source_kvm->lock);
+
+ if (!sev_guest(source_kvm)) {
+ ret = -EINVAL;
+ goto e_source_unlock;
+ }
+
+ /* Mirrors of mirrors should work, but let's not get silly */
+ if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+ ret = -EINVAL;
+ goto e_source_unlock;
+ }
+
+ asid = to_kvm_svm(source_kvm)->sev_info.asid;
+
+ /*
+ * The mirror kvm holds an enc_context_owner ref so its asid can't
+ * disappear until we're done with it
+ */
+ kvm_get_kvm(source_kvm);
+
+ fput(source_kvm_file);
+ mutex_unlock(&source_kvm->lock);
+ mutex_lock(&kvm->lock);
+
+ if (sev_guest(kvm)) {
+ ret = -EINVAL;
+ goto e_mirror_unlock;
+ }
+
+ /* Set enc_context_owner and copy its encryption context over */
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ mirror_sev->enc_context_owner = source_kvm;
+ mirror_sev->asid = asid;
+ mirror_sev->active = true;
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+e_mirror_unlock:
+ mutex_unlock(&kvm->lock);
+ kvm_put_kvm(source_kvm);
+ return ret;
+e_source_unlock:
+ mutex_unlock(&source_kvm->lock);
+e_source_put:
+ fput(source_kvm_file);
+ return ret;
+}
+
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -1331,6 +1766,12 @@ void sev_vm_destroy(struct kvm *kvm)
if (!sev_guest(kvm))
return;
+ /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ if (is_mirroring_enc_context(kvm)) {
+ kvm_put_kvm(sev->enc_context_owner);
+ return;
+ }
+
mutex_lock(&kvm->lock);
/*
@@ -1382,6 +1823,7 @@ void __init sev_hardware_setup(void)
/* Minimum ASID value that should be used for SEV guest */
min_sev_asid = edx;
+ sev_me_mask = 1UL << (ebx & 0x3f);
/* Initialize SEV ASID bitmaps */
sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
@@ -1825,7 +2267,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
len, GHCB_SCRATCH_AREA_LIMIT);
return false;
}
- scratch_va = kzalloc(len, GFP_KERNEL);
+ scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
return false;
@@ -1899,7 +2341,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
vcpu->arch.regs[VCPU_REGS_RCX] = 0;
- ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
if (!ret) {
ret = -EINVAL;
break;
@@ -1949,8 +2391,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
return ret;
}
-int sev_handle_vmgexit(struct vcpu_svm *svm)
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
u64 ghcb_gpa, exit_code;
struct ghcb *ghcb;
@@ -1962,13 +2405,13 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
return sev_handle_vmgexit_msr_protocol(svm);
if (!ghcb_gpa) {
- vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
+ vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
return -EINVAL;
}
- if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+ if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
/* Unable to map GHCB from guest */
- vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
ghcb_gpa);
return -EINVAL;
}
@@ -1976,7 +2419,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
svm->ghcb = svm->ghcb_map.hva;
ghcb = svm->ghcb_map.hva;
- trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
exit_code = ghcb_get_sw_exit_code(ghcb);
@@ -1994,7 +2437,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
break;
- ret = kvm_sev_es_mmio_read(&svm->vcpu,
+ ret = kvm_sev_es_mmio_read(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->ghcb_sa);
@@ -2003,19 +2446,19 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
break;
- ret = kvm_sev_es_mmio_write(&svm->vcpu,
+ ret = kvm_sev_es_mmio_write(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->ghcb_sa);
break;
case SVM_VMGEXIT_NMI_COMPLETE:
- ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
break;
case SVM_VMGEXIT_AP_HLT_LOOP:
- ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+ ret = kvm_emulate_ap_reset_hold(vcpu);
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
- struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
switch (control->exit_info_1) {
case 0:
@@ -2040,12 +2483,12 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
- vcpu_unimpl(&svm->vcpu,
+ vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
break;
default:
- ret = svm_invoke_exit_handler(svm, exit_code);
+ ret = svm_invoke_exit_handler(vcpu, exit_code);
}
return ret;
@@ -2154,5 +2597,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
* the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
* non-zero value.
*/
+ if (!svm->ghcb)
+ return;
+
ghcb_set_sw_exit_info_2(svm->ghcb, 1);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 58a45bb139f8..cd8c333ed2dc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -56,9 +56,6 @@ static const struct x86_cpu_id svm_cpu_id[] = {
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
@@ -95,6 +92,8 @@ static const struct svm_direct_access_msrs {
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
+ { .index = MSR_IA32_SYSENTER_EIP, .always = false },
+ { .index = MSR_IA32_SYSENTER_ESP, .always = false },
#ifdef CONFIG_X86_64
{ .index = MSR_GS_BASE, .always = true },
{ .index = MSR_FS_BASE, .always = true },
@@ -279,7 +278,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
* In this case we will return to the nested guest
* as soon as we leave SMM.
*/
- if (!is_smm(&svm->vcpu))
+ if (!is_smm(vcpu))
svm_free_nested(svm);
} else {
@@ -363,10 +362,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
bool has_error_code = vcpu->arch.exception.has_error_code;
u32 error_code = vcpu->arch.exception.error_code;
- kvm_deliver_exception_payload(&svm->vcpu);
+ kvm_deliver_exception_payload(vcpu);
if (nr == BP_VECTOR && !nrips) {
- unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
/*
* For guest debugging where we have to reinject #BP if some
@@ -375,8 +374,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
- (void)skip_emulated_instruction(&svm->vcpu);
- rip = kvm_rip_read(&svm->vcpu);
+ (void)skip_emulated_instruction(vcpu);
+ rip = kvm_rip_read(vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
}
@@ -681,14 +680,15 @@ void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
u32 *svm_vcpu_alloc_msrpm(void)
{
- struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+ unsigned int order = get_order(MSRPM_SIZE);
+ struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
u32 *msrpm;
if (!pages)
return NULL;
msrpm = page_address(pages);
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
return msrpm;
}
@@ -707,7 +707,7 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
void svm_vcpu_free_msrpm(u32 *msrpm)
{
- __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+ __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
@@ -881,7 +881,7 @@ static __init void svm_adjust_mmio_mask(void)
*/
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
- kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static void svm_hardware_teardown(void)
@@ -894,7 +894,8 @@ static void svm_hardware_teardown(void)
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
+ get_order(IOPM_SIZE));
iopm_base = 0;
}
@@ -930,14 +931,15 @@ static __init int svm_hardware_setup(void)
struct page *iopm_pages;
void *iopm_va;
int r;
+ unsigned int order = get_order(IOPM_SIZE);
- iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+ iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
@@ -1084,8 +1086,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
if (is_guest_mode(vcpu)) {
/* Write L1's TSC offset. */
g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
- svm->nested.hsave->control.tsc_offset = offset;
+ svm->vmcb01.ptr->control.tsc_offset;
+ svm->vmcb01.ptr->control.tsc_offset = offset;
}
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1113,12 +1115,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
}
}
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
- svm->vcpu.arch.hflags = 0;
+ vcpu->arch.hflags = 0;
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1126,7 +1129,7 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
- if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ if (!kvm_vcpu_apicv_active(vcpu))
svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
@@ -1170,12 +1173,12 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, INTERCEPT_RDPRU);
svm_set_intercept(svm, INTERCEPT_RSM);
- if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+ if (!kvm_mwait_in_guest(vcpu->kvm)) {
svm_set_intercept(svm, INTERCEPT_MONITOR);
svm_set_intercept(svm, INTERCEPT_MWAIT);
}
- if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+ if (!kvm_hlt_in_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_HLT);
control->iopm_base_pa = __sme_set(iopm_base);
@@ -1201,19 +1204,19 @@ static void init_vmcb(struct vcpu_svm *svm)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_cr4(&svm->vcpu, 0);
- svm_set_efer(&svm->vcpu, 0);
+ svm_set_cr4(vcpu, 0);
+ svm_set_efer(vcpu, 0);
save->dr6 = 0xffff0ff0;
- kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
save->rip = 0x0000fff0;
- svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+ vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
/*
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
* It also updates the guest-visible cr0 value.
*/
- svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
- kvm_mmu_reset_context(&svm->vcpu);
+ svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+ kvm_mmu_reset_context(vcpu);
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
@@ -1225,17 +1228,18 @@ static void init_vmcb(struct vcpu_svm *svm)
clr_exception_intercept(svm, PF_VECTOR);
svm_clr_intercept(svm, INTERCEPT_CR3_READ);
svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
- save->g_pat = svm->vcpu.arch.pat;
+ save->g_pat = vcpu->arch.pat;
save->cr3 = 0;
save->cr4 = 0;
}
- svm->asid_generation = 0;
+ svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
svm->nested.vmcb12_gpa = 0;
- svm->vcpu.arch.hflags = 0;
+ svm->nested.last_vmcb12_gpa = 0;
+ vcpu->arch.hflags = 0;
- if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
control->pause_filter_count = pause_filter_count;
if (pause_filter_thresh)
control->pause_filter_thresh = pause_filter_thresh;
@@ -1246,18 +1250,15 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_check_invpcid(svm);
- if (kvm_vcpu_apicv_active(&svm->vcpu))
- avic_init_vmcb(svm);
-
/*
- * If hardware supports Virtual VMLOAD VMSAVE then enable it
- * in VMCB and clear intercepts to avoid #VMEXIT.
+ * If the host supports V_SPEC_CTRL then disable the interception
+ * of MSR_IA32_SPEC_CTRL.
*/
- if (vls) {
- svm_clr_intercept(svm, INTERCEPT_VMLOAD);
- svm_clr_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
- }
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_init_vmcb(svm);
if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI);
@@ -1265,11 +1266,11 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
- if (sev_guest(svm->vcpu.kvm)) {
+ if (sev_guest(vcpu->kvm)) {
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
clr_exception_intercept(svm, UD_VECTOR);
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/* Perform SEV-ES specific VMCB updates */
sev_es_init_vmcb(svm);
}
@@ -1291,12 +1292,12 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
svm->virt_spec_ctrl = 0;
if (!init_event) {
- svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
}
- init_vmcb(svm);
+ init_vmcb(vcpu);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
kvm_rdx_write(vcpu, eax);
@@ -1305,10 +1306,16 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+ svm->current_vmcb = target_vmcb;
+ svm->vmcb = target_vmcb->ptr;
+}
+
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
- struct page *vmcb_page;
+ struct page *vmcb01_page;
struct page *vmsa_page = NULL;
int err;
@@ -1316,11 +1323,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
err = -ENOMEM;
- vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!vmcb_page)
+ vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb01_page)
goto out;
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/*
* SEV-ES guests require a separate VMSA page used to contain
* the encrypted register state of the guest.
@@ -1356,20 +1363,21 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm_vcpu_init_msrpm(vcpu, svm->msrpm);
- svm->vmcb = page_address(vmcb_page);
- svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+ svm->vmcb01.ptr = page_address(vmcb01_page);
+ svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
if (vmsa_page)
svm->vmsa = page_address(vmsa_page);
- svm->asid_generation = 0;
svm->guest_state_loaded = false;
- init_vmcb(svm);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+ init_vmcb(vcpu);
svm_init_osvw(vcpu);
vcpu->arch.microcode_version = 0x01000065;
- if (sev_es_guest(svm->vcpu.kvm))
+ if (sev_es_guest(vcpu->kvm))
/* Perform SEV-ES specific VMCB creation updates */
sev_es_create_vcpu(svm);
@@ -1379,7 +1387,7 @@ error_free_vmsa_page:
if (vmsa_page)
__free_page(vmsa_page);
error_free_vmcb_page:
- __free_page(vmcb_page);
+ __free_page(vmcb01_page);
out:
return err;
}
@@ -1407,8 +1415,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
- __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
- __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -1432,7 +1440,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
* Save additional host state that will be restored on VMEXIT (sev-es)
* or subsequent vmload of host save area.
*/
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
sev_es_prepare_guest_switch(svm, vcpu->cpu);
} else {
vmsave(__sme_page_pa(sd->save_area));
@@ -1476,11 +1484,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- if (unlikely(cpu != vcpu->cpu)) {
- svm->asid_generation = 0;
- vmcb_mark_all_dirty(svm->vmcb);
- }
-
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
indirect_branch_prediction_barrier();
@@ -1564,7 +1567,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
/* Drop int_ctl fields related to VINTR injection. */
svm->vmcb->control.int_ctl &= mask;
if (is_guest_mode(&svm->vcpu)) {
- svm->nested.hsave->control.int_ctl &= mask;
+ svm->vmcb01.ptr->control.int_ctl &= mask;
WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
(svm->nested.ctl.int_ctl & V_TPR_MASK));
@@ -1577,16 +1580,17 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+ struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
switch (seg) {
case VCPU_SREG_CS: return &save->cs;
case VCPU_SREG_DS: return &save->ds;
case VCPU_SREG_ES: return &save->es;
- case VCPU_SREG_FS: return &save->fs;
- case VCPU_SREG_GS: return &save->gs;
+ case VCPU_SREG_FS: return &save01->fs;
+ case VCPU_SREG_GS: return &save01->gs;
case VCPU_SREG_SS: return &save->ss;
- case VCPU_SREG_TR: return &save->tr;
- case VCPU_SREG_LDTR: return &save->ldtr;
+ case VCPU_SREG_TR: return &save01->tr;
+ case VCPU_SREG_LDTR: return &save01->ldtr;
}
BUG();
return NULL;
@@ -1709,37 +1713,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
-static void update_cr0_intercept(struct vcpu_svm *svm)
-{
- ulong gcr0;
- u64 *hcr0;
-
- /*
- * SEV-ES guests must always keep the CR intercepts cleared. CR
- * tracking is done using the CR write traps.
- */
- if (sev_es_guest(svm->vcpu.kvm))
- return;
-
- gcr0 = svm->vcpu.arch.cr0;
- hcr0 = &svm->vmcb->save.cr0;
- *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
- | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
- vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-
- if (gcr0 == *hcr0) {
- svm_clr_intercept(svm, INTERCEPT_CR0_READ);
- svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
- } else {
- svm_set_intercept(svm, INTERCEPT_CR0_READ);
- svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
- }
-}
-
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u64 hcr0 = cr0;
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1757,7 +1734,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vcpu->arch.cr0 = cr0;
if (!npt_enabled)
- cr0 |= X86_CR0_PG | X86_CR0_WP;
+ hcr0 |= X86_CR0_PG | X86_CR0_WP;
/*
* re-enable caching here because the QEMU bios
@@ -1765,10 +1742,26 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* reboot
*/
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
- svm->vmcb->save.cr0 = cr0;
+ hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+ svm->vmcb->save.cr0 = hcr0;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
- update_cr0_intercept(svm);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (hcr0 == cr0) {
+ /* Selective CR0 write remains on. */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
}
static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1847,7 +1840,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
- svm->asid_generation = sd->asid_generation;
+ svm->current_vmcb->asid_generation = sd->asid_generation;
svm->asid = sd->next_asid++;
}
@@ -1896,39 +1889,43 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
-static int pf_interception(struct vcpu_svm *svm)
+static int pf_interception(struct kvm_vcpu *vcpu)
{
- u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
- return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+ return kvm_handle_page_fault(vcpu, error_code, fault_address,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
-static int npf_interception(struct vcpu_svm *svm)
+static int npf_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
u64 error_code = svm->vmcb->control.exit_info_1;
trace_kvm_page_fault(fault_address, error_code);
- return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ return kvm_mmu_page_fault(vcpu, fault_address, error_code,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
-static int db_interception(struct vcpu_svm *svm)
+static int db_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
- if (!(svm->vcpu.guest_debug &
+ if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
!svm->nmi_singlestep) {
u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
- kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
+ kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
return 1;
}
@@ -1938,7 +1935,7 @@ static int db_interception(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
- if (svm->vcpu.guest_debug &
+ if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
@@ -1952,9 +1949,10 @@ static int db_interception(struct vcpu_svm *svm)
return 1;
}
-static int bp_interception(struct vcpu_svm *svm)
+static int bp_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1962,14 +1960,14 @@ static int bp_interception(struct vcpu_svm *svm)
return 0;
}
-static int ud_interception(struct vcpu_svm *svm)
+static int ud_interception(struct kvm_vcpu *vcpu)
{
- return handle_ud(&svm->vcpu);
+ return handle_ud(vcpu);
}
-static int ac_interception(struct vcpu_svm *svm)
+static int ac_interception(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
return 1;
}
@@ -2012,7 +2010,7 @@ static bool is_erratum_383(void)
return true;
}
-static void svm_handle_mce(struct vcpu_svm *svm)
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
if (is_erratum_383()) {
/*
@@ -2021,7 +2019,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
*/
pr_err("KVM: Guest triggered AMD Erratum 383\n");
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -2033,20 +2031,21 @@ static void svm_handle_mce(struct vcpu_svm *svm)
kvm_machine_check();
}
-static int mc_interception(struct vcpu_svm *svm)
+static int mc_interception(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int shutdown_interception(struct vcpu_svm *svm)
+static int shutdown_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
/*
* The VM save area has already been encrypted so it
* cannot be reinitialized - just terminate.
*/
- if (sev_es_guest(svm->vcpu.kvm))
+ if (sev_es_guest(vcpu->kvm))
return -EINVAL;
/*
@@ -2054,20 +2053,20 @@ static int shutdown_interception(struct vcpu_svm *svm)
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm);
+ init_vmcb(vcpu);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
}
-static int io_interception(struct vcpu_svm *svm)
+static int io_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
unsigned port;
- ++svm->vcpu.stat.io_exits;
+ ++vcpu->stat.io_exits;
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
port = io_info >> 16;
@@ -2082,93 +2081,69 @@ static int io_interception(struct vcpu_svm *svm)
svm->next_rip = svm->vmcb->control.exit_info_2;
- return kvm_fast_pio(&svm->vcpu, size, port, in);
-}
-
-static int nmi_interception(struct vcpu_svm *svm)
-{
- return 1;
+ return kvm_fast_pio(vcpu, size, port, in);
}
-static int intr_interception(struct vcpu_svm *svm)
+static int nmi_interception(struct kvm_vcpu *vcpu)
{
- ++svm->vcpu.stat.irq_exits;
return 1;
}
-static int nop_on_interception(struct vcpu_svm *svm)
+static int intr_interception(struct kvm_vcpu *vcpu)
{
+ ++vcpu->stat.irq_exits;
return 1;
}
-static int halt_interception(struct vcpu_svm *svm)
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
- return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_hypercall(&svm->vcpu);
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb12;
struct kvm_host_map map;
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
if (ret) {
if (ret == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
- nested_vmcb = map.hva;
+ vmcb12 = map.hva;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ ret = kvm_skip_emulated_instruction(vcpu);
- nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ if (vmload) {
+ nested_svm_vmloadsave(vmcb12, svm->vmcb);
+ svm->sysenter_eip_hi = 0;
+ svm->sysenter_esp_hi = 0;
+ } else
+ nested_svm_vmloadsave(svm->vmcb, vmcb12);
+
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
-static int vmsave_interception(struct vcpu_svm *svm)
+static int vmload_interception(struct kvm_vcpu *vcpu)
{
- struct vmcb *nested_vmcb;
- struct kvm_host_map map;
- int ret;
-
- if (nested_svm_check_permissions(svm))
- return 1;
-
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
- if (ret) {
- if (ret == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- }
-
- nested_vmcb = map.hva;
-
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
- nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ return vmload_vmsave_interception(vcpu, true);
+}
- return ret;
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, false);
}
-static int vmrun_interception(struct vcpu_svm *svm)
+static int vmrun_interception(struct kvm_vcpu *vcpu)
{
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- return nested_svm_vmrun(svm);
+ return nested_svm_vmrun(vcpu);
}
enum {
@@ -2207,7 +2182,7 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
};
- int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+ int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_INSTR_VMRUN] = vmrun_interception,
[SVM_INSTR_VMLOAD] = vmload_interception,
[SVM_INSTR_VMSAVE] = vmsave_interception,
@@ -2216,17 +2191,13 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
int ret;
if (is_guest_mode(vcpu)) {
- svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
/* Returns '1' or -errno on failure, '0' on success. */
- ret = nested_svm_vmexit(svm);
+ ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
if (ret)
return ret;
return 1;
}
- return svm_instr_handlers[opcode](svm);
+ return svm_instr_handlers[opcode](vcpu);
}
/*
@@ -2237,9 +2208,9 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
* regions (e.g. SMM memory on host).
* 2) VMware backdoor
*/
-static int gp_interception(struct vcpu_svm *svm)
+static int gp_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 error_code = svm->vmcb->control.exit_info_1;
int opcode;
@@ -2304,73 +2275,52 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
}
}
-static int stgi_interception(struct vcpu_svm *svm)
+static int stgi_interception(struct kvm_vcpu *vcpu)
{
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- svm_set_gif(svm, true);
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), true);
return ret;
}
-static int clgi_interception(struct vcpu_svm *svm)
+static int clgi_interception(struct kvm_vcpu *vcpu)
{
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- svm_set_gif(svm, false);
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), false);
return ret;
}
-static int invlpga_interception(struct vcpu_svm *svm)
+static int invlpga_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
-
- trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
- kvm_rax_read(&svm->vcpu));
+ trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, kvm_rcx_read(vcpu),
+ kvm_rax_read(vcpu));
/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
- kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
-
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+ kvm_mmu_invlpg(vcpu, kvm_rax_read(vcpu));
-static int skinit_interception(struct vcpu_svm *svm)
-{
- trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
-
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static int wbinvd_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_wbinvd(&svm->vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int xsetbv_interception(struct vcpu_svm *svm)
+static int skinit_interception(struct kvm_vcpu *vcpu)
{
- u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
- u32 index = kvm_rcx_read(&svm->vcpu);
+ trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
- int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
- return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static int rdpru_interception(struct vcpu_svm *svm)
-{
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
-static int task_switch_interception(struct vcpu_svm *svm)
+static int task_switch_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u16 tss_selector;
int reason;
int int_type = svm->vmcb->control.exit_int_info &
@@ -2399,7 +2349,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
if (reason == TASK_SWITCH_GATE) {
switch (type) {
case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = false;
+ vcpu->arch.nmi_injected = false;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
if (svm->vmcb->control.exit_info_2 &
@@ -2408,10 +2358,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
error_code =
(u32)svm->vmcb->control.exit_info_2;
}
- kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
break;
case SVM_EXITINTINFO_TYPE_INTR:
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(vcpu);
break;
default:
break;
@@ -2422,77 +2372,58 @@ static int task_switch_interception(struct vcpu_svm *svm)
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
- if (!skip_emulated_instruction(&svm->vcpu))
+ if (!skip_emulated_instruction(vcpu))
return 0;
}
if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
int_vec = -1;
- return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+ return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
has_error_code, error_code);
}
-static int cpuid_interception(struct vcpu_svm *svm)
+static int iret_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_cpuid(&svm->vcpu);
-}
+ struct vcpu_svm *svm = to_svm(vcpu);
-static int iret_interception(struct vcpu_svm *svm)
-{
- ++svm->vcpu.stat.nmi_window_exits;
- svm->vcpu.arch.hflags |= HF_IRET_MASK;
- if (!sev_es_guest(svm->vcpu.kvm)) {
+ ++vcpu->stat.nmi_window_exits;
+ vcpu->arch.hflags |= HF_IRET_MASK;
+ if (!sev_es_guest(vcpu->kvm)) {
svm_clr_intercept(svm, INTERCEPT_IRET);
- svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
}
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
-static int invd_interception(struct vcpu_svm *svm)
-{
- /* Treat an INVD instruction as a NOP and just skip it. */
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
+static int invlpg_interception(struct kvm_vcpu *vcpu)
{
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return kvm_emulate_instruction(&svm->vcpu, 0);
-
- kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+ return kvm_emulate_instruction(vcpu, 0);
-static int emulate_on_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_instruction(&svm->vcpu, 0);
+ kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int rsm_interception(struct vcpu_svm *svm)
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
+ return kvm_emulate_instruction(vcpu, 0);
}
-static int rdpmc_interception(struct vcpu_svm *svm)
+static int rsm_interception(struct kvm_vcpu *vcpu)
{
- int err;
-
- if (!nrips)
- return emulate_on_interception(svm);
-
- err = kvm_rdpmc(&svm->vcpu);
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
}
-static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
unsigned long val)
{
- unsigned long cr0 = svm->vcpu.arch.cr0;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr0 = vcpu->arch.cr0;
bool ret = false;
- if (!is_guest_mode(&svm->vcpu) ||
+ if (!is_guest_mode(vcpu) ||
(!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
return false;
@@ -2509,17 +2440,18 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
#define CR_VALID (1ULL << 63)
-static int cr_interception(struct vcpu_svm *svm)
+static int cr_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int reg, cr;
unsigned long val;
int err;
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
@@ -2530,61 +2462,61 @@ static int cr_interception(struct vcpu_svm *svm)
err = 0;
if (cr >= 16) { /* mov to cr */
cr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
+ val = kvm_register_read(vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- if (!check_selective_cr0_intercepted(svm, val))
- err = kvm_set_cr0(&svm->vcpu, val);
+ if (!check_selective_cr0_intercepted(vcpu, val))
+ err = kvm_set_cr0(vcpu, val);
else
return 1;
break;
case 3:
- err = kvm_set_cr3(&svm->vcpu, val);
+ err = kvm_set_cr3(vcpu, val);
break;
case 4:
- err = kvm_set_cr4(&svm->vcpu, val);
+ err = kvm_set_cr4(vcpu, val);
break;
case 8:
- err = kvm_set_cr8(&svm->vcpu, val);
+ err = kvm_set_cr8(vcpu, val);
break;
default:
WARN(1, "unhandled write to CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
} else { /* mov from cr */
switch (cr) {
case 0:
- val = kvm_read_cr0(&svm->vcpu);
+ val = kvm_read_cr0(vcpu);
break;
case 2:
- val = svm->vcpu.arch.cr2;
+ val = vcpu->arch.cr2;
break;
case 3:
- val = kvm_read_cr3(&svm->vcpu);
+ val = kvm_read_cr3(vcpu);
break;
case 4:
- val = kvm_read_cr4(&svm->vcpu);
+ val = kvm_read_cr4(vcpu);
break;
case 8:
- val = kvm_get_cr8(&svm->vcpu);
+ val = kvm_get_cr8(vcpu);
break;
default:
WARN(1, "unhandled read from CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
}
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ return kvm_complete_insn_gp(vcpu, err);
}
-static int cr_trap(struct vcpu_svm *svm)
+static int cr_trap(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
unsigned long old_value, new_value;
unsigned int cr;
int ret = 0;
@@ -2606,7 +2538,7 @@ static int cr_trap(struct vcpu_svm *svm)
kvm_post_set_cr4(vcpu, old_value, new_value);
break;
case 8:
- ret = kvm_set_cr8(&svm->vcpu, new_value);
+ ret = kvm_set_cr8(vcpu, new_value);
break;
default:
WARN(1, "unhandled CR%d write trap", cr);
@@ -2617,57 +2549,57 @@ static int cr_trap(struct vcpu_svm *svm)
return kvm_complete_insn_gp(vcpu, ret);
}
-static int dr_interception(struct vcpu_svm *svm)
+static int dr_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int reg, dr;
unsigned long val;
int err = 0;
- if (svm->vcpu.guest_debug == 0) {
+ if (vcpu->guest_debug == 0) {
/*
* No more DR vmexits; force a reload of the debug registers
* and reenter on this instruction. The next vmexit will
* retrieve the full state of the debug registers.
*/
clr_dr_intercepts(svm);
- svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
return 1;
}
if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
if (dr >= 16) { /* mov to DRn */
dr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
- err = kvm_set_dr(&svm->vcpu, dr, val);
+ val = kvm_register_read(vcpu, reg);
+ err = kvm_set_dr(vcpu, dr, val);
} else {
- kvm_get_dr(&svm->vcpu, dr, &val);
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_get_dr(vcpu, dr, &val);
+ kvm_register_write(vcpu, reg, val);
}
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ return kvm_complete_insn_gp(vcpu, err);
}
-static int cr8_write_interception(struct vcpu_svm *svm)
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
int r;
- u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+ u8 cr8_prev = kvm_get_cr8(vcpu);
/* instruction emulation calls kvm_set_cr8() */
- r = cr_interception(svm);
- if (lapic_in_kernel(&svm->vcpu))
+ r = cr_interception(vcpu);
+ if (lapic_in_kernel(vcpu))
return r;
- if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+ if (cr8_prev <= kvm_get_cr8(vcpu))
return r;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
-static int efer_trap(struct vcpu_svm *svm)
+static int efer_trap(struct kvm_vcpu *vcpu)
{
struct msr_data msr_info;
int ret;
@@ -2680,10 +2612,10 @@ static int efer_trap(struct vcpu_svm *svm)
*/
msr_info.host_initiated = false;
msr_info.index = MSR_EFER;
- msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME;
- ret = kvm_set_msr_common(&svm->vcpu, &msr_info);
+ msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+ ret = kvm_set_msr_common(vcpu, &msr_info);
- return kvm_complete_insn_gp(&svm->vcpu, ret);
+ return kvm_complete_insn_gp(vcpu, ret);
}
static int svm_get_msr_feature(struct kvm_msr_entry *msr)
@@ -2710,30 +2642,34 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_STAR:
- msr_info->data = svm->vmcb->save.star;
+ msr_info->data = svm->vmcb01.ptr->save.star;
break;
#ifdef CONFIG_X86_64
case MSR_LSTAR:
- msr_info->data = svm->vmcb->save.lstar;
+ msr_info->data = svm->vmcb01.ptr->save.lstar;
break;
case MSR_CSTAR:
- msr_info->data = svm->vmcb->save.cstar;
+ msr_info->data = svm->vmcb01.ptr->save.cstar;
break;
case MSR_KERNEL_GS_BASE:
- msr_info->data = svm->vmcb->save.kernel_gs_base;
+ msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
break;
case MSR_SYSCALL_MASK:
- msr_info->data = svm->vmcb->save.sfmask;
+ msr_info->data = svm->vmcb01.ptr->save.sfmask;
break;
#endif
case MSR_IA32_SYSENTER_CS:
- msr_info->data = svm->vmcb->save.sysenter_cs;
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
- msr_info->data = svm->sysenter_eip;
+ msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
break;
case MSR_IA32_SYSENTER_ESP:
- msr_info->data = svm->sysenter_esp;
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break;
case MSR_TSC_AUX:
if (!boot_cpu_has(X86_FEATURE_RDTSCP))
@@ -2771,7 +2707,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_has_spec_ctrl_msr(vcpu))
return 1;
- msr_info->data = svm->spec_ctrl;
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ msr_info->data = svm->vmcb->save.spec_ctrl;
+ else
+ msr_info->data = svm->spec_ctrl;
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr_info->host_initiated &&
@@ -2809,8 +2748,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (!sev_es_guest(svm->vcpu.kvm) || !err)
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb))
+ return kvm_complete_insn_gp(vcpu, err);
ghcb_set_sw_exit_info_1(svm->ghcb, 1);
ghcb_set_sw_exit_info_2(svm->ghcb,
@@ -2820,11 +2759,6 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
return 1;
}
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_rdmsr(&svm->vcpu);
-}
-
static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2861,7 +2795,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
return 1;
vcpu->arch.pat = data;
- svm->vmcb->save.g_pat = data;
+ svm->vmcb01.ptr->save.g_pat = data;
+ if (is_guest_mode(vcpu))
+ nested_vmcb02_compute_g_pat(svm);
vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
break;
case MSR_IA32_SPEC_CTRL:
@@ -2872,7 +2808,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (kvm_spec_ctrl_test_value(data))
return 1;
- svm->spec_ctrl = data;
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ svm->vmcb->save.spec_ctrl = data;
+ else
+ svm->spec_ctrl = data;
if (!data)
break;
@@ -2915,32 +2854,39 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->virt_spec_ctrl = data;
break;
case MSR_STAR:
- svm->vmcb->save.star = data;
+ svm->vmcb01.ptr->save.star = data;
break;
#ifdef CONFIG_X86_64
case MSR_LSTAR:
- svm->vmcb->save.lstar = data;
+ svm->vmcb01.ptr->save.lstar = data;
break;
case MSR_CSTAR:
- svm->vmcb->save.cstar = data;
+ svm->vmcb01.ptr->save.cstar = data;
break;
case MSR_KERNEL_GS_BASE:
- svm->vmcb->save.kernel_gs_base = data;
+ svm->vmcb01.ptr->save.kernel_gs_base = data;
break;
case MSR_SYSCALL_MASK:
- svm->vmcb->save.sfmask = data;
+ svm->vmcb01.ptr->save.sfmask = data;
break;
#endif
case MSR_IA32_SYSENTER_CS:
- svm->vmcb->save.sysenter_cs = data;
+ svm->vmcb01.ptr->save.sysenter_cs = data;
break;
case MSR_IA32_SYSENTER_EIP:
- svm->sysenter_eip = data;
- svm->vmcb->save.sysenter_eip = data;
+ svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
+ /*
+ * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
+ * when we spoof an Intel vendor ID (for cross vendor migration).
+ * In this case we use this intercept to track the high
+ * 32 bit part of these msrs to support Intel's
+ * implementation of SYSENTER/SYSEXIT.
+ */
+ svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break;
case MSR_IA32_SYSENTER_ESP:
- svm->sysenter_esp = data;
- svm->vmcb->save.sysenter_esp = data;
+ svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break;
case MSR_TSC_AUX:
if (!boot_cpu_has(X86_FEATURE_RDTSCP))
@@ -3006,38 +2952,32 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
return 0;
}
-static int wrmsr_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_wrmsr(&svm->vcpu);
-}
-
-static int msr_interception(struct vcpu_svm *svm)
+static int msr_interception(struct kvm_vcpu *vcpu)
{
- if (svm->vmcb->control.exit_info_1)
- return wrmsr_interception(svm);
+ if (to_svm(vcpu)->vmcb->control.exit_info_1)
+ return kvm_emulate_wrmsr(vcpu);
else
- return rdmsr_interception(svm);
+ return kvm_emulate_rdmsr(vcpu);
}
-static int interrupt_window_interception(struct vcpu_svm *svm)
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
- svm_clear_vintr(svm);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ svm_clear_vintr(to_svm(vcpu));
/*
* For AVIC, the only reason to end up here is ExtINTs.
* In this case AVIC was temporarily disabled for
* requesting the IRQ window and we have to re-enable it.
*/
- svm_toggle_avic_for_irq_window(&svm->vcpu, true);
+ svm_toggle_avic_for_irq_window(vcpu, true);
- ++svm->vcpu.stat.irq_window_exits;
+ ++vcpu->stat.irq_window_exits;
return 1;
}
-static int pause_interception(struct vcpu_svm *svm)
+static int pause_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
bool in_kernel;
/*
@@ -3045,35 +2985,18 @@ static int pause_interception(struct vcpu_svm *svm)
* vcpu->arch.preempted_in_kernel can never be true. Just
* set in_kernel to false as well.
*/
- in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0;
+ in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
if (!kvm_pause_in_guest(vcpu->kvm))
grow_ple_window(vcpu);
kvm_vcpu_on_spin(vcpu, in_kernel);
- return 1;
-}
-
-static int nop_interception(struct vcpu_svm *svm)
-{
- return kvm_skip_emulated_instruction(&(svm->vcpu));
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int monitor_interception(struct vcpu_svm *svm)
+static int invpcid_interception(struct kvm_vcpu *vcpu)
{
- printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
- return nop_interception(svm);
-}
-
-static int mwait_interception(struct vcpu_svm *svm)
-{
- printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
- return nop_interception(svm);
-}
-
-static int invpcid_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
unsigned long type;
gva_t gva;
@@ -3098,7 +3021,7 @@ static int invpcid_interception(struct vcpu_svm *svm)
return kvm_handle_invpcid(vcpu, type, gva);
}
-static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
[SVM_EXIT_READ_CR4] = cr_interception,
@@ -3133,15 +3056,15 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
[SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
- [SVM_EXIT_SMI] = nop_on_interception,
- [SVM_EXIT_INIT] = nop_on_interception,
+ [SVM_EXIT_SMI] = kvm_emulate_as_nop,
+ [SVM_EXIT_INIT] = kvm_emulate_as_nop,
[SVM_EXIT_VINTR] = interrupt_window_interception,
- [SVM_EXIT_RDPMC] = rdpmc_interception,
- [SVM_EXIT_CPUID] = cpuid_interception,
+ [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
+ [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
[SVM_EXIT_IRET] = iret_interception,
- [SVM_EXIT_INVD] = invd_interception,
+ [SVM_EXIT_INVD] = kvm_emulate_invd,
[SVM_EXIT_PAUSE] = pause_interception,
- [SVM_EXIT_HLT] = halt_interception,
+ [SVM_EXIT_HLT] = kvm_emulate_halt,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invlpga_interception,
[SVM_EXIT_IOIO] = io_interception,
@@ -3149,17 +3072,17 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
[SVM_EXIT_SHUTDOWN] = shutdown_interception,
[SVM_EXIT_VMRUN] = vmrun_interception,
- [SVM_EXIT_VMMCALL] = vmmcall_interception,
+ [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
[SVM_EXIT_VMLOAD] = vmload_interception,
[SVM_EXIT_VMSAVE] = vmsave_interception,
[SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = skinit_interception,
- [SVM_EXIT_WBINVD] = wbinvd_interception,
- [SVM_EXIT_MONITOR] = monitor_interception,
- [SVM_EXIT_MWAIT] = mwait_interception,
- [SVM_EXIT_XSETBV] = xsetbv_interception,
- [SVM_EXIT_RDPRU] = rdpru_interception,
+ [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
+ [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
+ [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
+ [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
+ [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
[SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
[SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
[SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
@@ -3177,6 +3100,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
+ struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
@@ -3239,28 +3163,28 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
save->ds.limit, save->ds.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"fs:",
- save->fs.selector, save->fs.attrib,
- save->fs.limit, save->fs.base);
+ save01->fs.selector, save01->fs.attrib,
+ save01->fs.limit, save01->fs.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"gs:",
- save->gs.selector, save->gs.attrib,
- save->gs.limit, save->gs.base);
+ save01->gs.selector, save01->gs.attrib,
+ save01->gs.limit, save01->gs.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"gdtr:",
save->gdtr.selector, save->gdtr.attrib,
save->gdtr.limit, save->gdtr.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"ldtr:",
- save->ldtr.selector, save->ldtr.attrib,
- save->ldtr.limit, save->ldtr.base);
+ save01->ldtr.selector, save01->ldtr.attrib,
+ save01->ldtr.limit, save01->ldtr.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"idtr:",
save->idtr.selector, save->idtr.attrib,
save->idtr.limit, save->idtr.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"tr:",
- save->tr.selector, save->tr.attrib,
- save->tr.limit, save->tr.base);
+ save01->tr.selector, save01->tr.attrib,
+ save01->tr.limit, save01->tr.base);
pr_err("cpl: %d efer: %016llx\n",
save->cpl, save->efer);
pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3274,15 +3198,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-15s %016llx %-13s %016llx\n",
"rsp:", save->rsp, "rax:", save->rax);
pr_err("%-15s %016llx %-13s %016llx\n",
- "star:", save->star, "lstar:", save->lstar);
+ "star:", save01->star, "lstar:", save01->lstar);
pr_err("%-15s %016llx %-13s %016llx\n",
- "cstar:", save->cstar, "sfmask:", save->sfmask);
+ "cstar:", save01->cstar, "sfmask:", save01->sfmask);
pr_err("%-15s %016llx %-13s %016llx\n",
- "kernel_gs_base:", save->kernel_gs_base,
- "sysenter_cs:", save->sysenter_cs);
+ "kernel_gs_base:", save01->kernel_gs_base,
+ "sysenter_cs:", save01->sysenter_cs);
pr_err("%-15s %016llx %-13s %016llx\n",
- "sysenter_esp:", save->sysenter_esp,
- "sysenter_eip:", save->sysenter_eip);
+ "sysenter_esp:", save01->sysenter_esp,
+ "sysenter_eip:", save01->sysenter_eip);
pr_err("%-15s %016llx %-13s %016llx\n",
"gpat:", save->g_pat, "dbgctl:", save->dbgctl);
pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3309,24 +3233,24 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
return -EINVAL;
}
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code)
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (svm_handle_invalid_exit(&svm->vcpu, exit_code))
+ if (svm_handle_invalid_exit(vcpu, exit_code))
return 0;
#ifdef CONFIG_RETPOLINE
if (exit_code == SVM_EXIT_MSR)
- return msr_interception(svm);
+ return msr_interception(vcpu);
else if (exit_code == SVM_EXIT_VINTR)
- return interrupt_window_interception(svm);
+ return interrupt_window_interception(vcpu);
else if (exit_code == SVM_EXIT_INTR)
- return intr_interception(svm);
+ return intr_interception(vcpu);
else if (exit_code == SVM_EXIT_HLT)
- return halt_interception(svm);
+ return kvm_emulate_halt(vcpu);
else if (exit_code == SVM_EXIT_NPF)
- return npf_interception(svm);
+ return npf_interception(vcpu);
#endif
- return svm_exit_handlers[exit_code](svm);
+ return svm_exit_handlers[exit_code](vcpu);
}
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
@@ -3395,7 +3319,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
- return svm_invoke_exit_handler(svm, exit_code);
+ return svm_invoke_exit_handler(vcpu, exit_code);
}
static void reload_tss(struct kvm_vcpu *vcpu)
@@ -3406,15 +3330,27 @@ static void reload_tss(struct kvm_vcpu *vcpu)
load_TR_desc();
}
-static void pre_svm_run(struct vcpu_svm *svm)
+static void pre_svm_run(struct kvm_vcpu *vcpu)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If the previous vmrun of the vmcb occurred on a different physical
+ * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
+ * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
+ */
+ if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+ svm->current_vmcb->asid_generation = 0;
+ vmcb_mark_all_dirty(svm->vmcb);
+ svm->current_vmcb->cpu = vcpu->cpu;
+ }
- if (sev_guest(svm->vcpu.kvm))
- return pre_sev_run(svm, svm->vcpu.cpu);
+ if (sev_guest(vcpu->kvm))
+ return pre_sev_run(svm, vcpu->cpu);
/* FIXME: handle wraparound of asid_generation */
- if (svm->asid_generation != sd->asid_generation)
+ if (svm->current_vmcb->asid_generation != sd->asid_generation)
new_asid(svm, sd);
}
@@ -3424,7 +3360,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- if (!sev_es_guest(svm->vcpu.kvm))
+ if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
@@ -3478,7 +3414,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
return false;
ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- (svm->vcpu.arch.hflags & HF_NMI_MASK);
+ (vcpu->arch.hflags & HF_NMI_MASK);
return ret;
}
@@ -3498,9 +3434,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+ return !!(vcpu->arch.hflags & HF_NMI_MASK);
}
static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3508,12 +3442,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
struct vcpu_svm *svm = to_svm(vcpu);
if (masked) {
- svm->vcpu.arch.hflags |= HF_NMI_MASK;
- if (!sev_es_guest(svm->vcpu.kvm))
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
} else {
- svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- if (!sev_es_guest(svm->vcpu.kvm))
+ vcpu->arch.hflags &= ~HF_NMI_MASK;
+ if (!sev_es_guest(vcpu->kvm))
svm_clr_intercept(svm, INTERCEPT_IRET);
}
}
@@ -3526,7 +3460,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (!gif_set(svm))
return true;
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/*
* SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
* bit to determine the state of the IF flag.
@@ -3536,7 +3470,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
} else if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
- ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
+ ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
: !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
return true;
@@ -3595,8 +3529,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
- == HF_NMI_MASK)
+ if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
@@ -3638,7 +3571,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu)
if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
else
- svm->asid_generation--;
+ svm->current_vmcb->asid_generation--;
}
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3675,8 +3608,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}
-static void svm_complete_interrupts(struct vcpu_svm *svm)
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u8 vector;
int type;
u32 exitintinfo = svm->vmcb->control.exit_int_info;
@@ -3688,28 +3622,28 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
* If we've made progress since setting HF_IRET_MASK, we've
* executed an IRET and can allow NMI injection.
*/
- if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
- (sev_es_guest(svm->vcpu.kvm) ||
- kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) {
- svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+ (sev_es_guest(vcpu->kvm) ||
+ kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+ vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
- svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
if (!(exitintinfo & SVM_EXITINTINFO_VALID))
return;
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
switch (type) {
case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = true;
+ vcpu->arch.nmi_injected = true;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
/*
@@ -3725,21 +3659,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
*/
if (kvm_exception_is_soft(vector)) {
if (vector == BP_VECTOR && int3_injected &&
- kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
- kvm_rip_write(&svm->vcpu,
- kvm_rip_read(&svm->vcpu) -
- int3_injected);
+ kvm_is_linear_rip(vcpu, svm->int3_rip))
+ kvm_rip_write(vcpu,
+ kvm_rip_read(vcpu) - int3_injected);
break;
}
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_requeue_exception_e(&svm->vcpu, vector, err);
+ kvm_requeue_exception_e(vcpu, vector, err);
} else
- kvm_requeue_exception(&svm->vcpu, vector);
+ kvm_requeue_exception(vcpu, vector);
break;
case SVM_EXITINTINFO_TYPE_INTR:
- kvm_queue_interrupt(&svm->vcpu, vector, false);
+ kvm_queue_interrupt(vcpu, vector, false);
break;
default:
break;
@@ -3754,7 +3687,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
control->exit_int_info = control->event_inj;
control->exit_int_info_err = control->event_inj_err;
control->event_inj = 0;
- svm_complete_interrupts(svm);
+ svm_complete_interrupts(vcpu);
}
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
@@ -3766,9 +3699,11 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
- struct vcpu_svm *svm)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long vmcb_pa = svm->current_vmcb->pa;
+
/*
* VMENTER enables interrupts (host state), but the kernel state is
* interrupts disabled when this is invoked. Also tell RCU about
@@ -3789,12 +3724,20 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_enter_irqoff();
lockdep_hardirqs_on(CALLER_ADDR0);
- if (sev_es_guest(svm->vcpu.kvm)) {
- __svm_sev_es_vcpu_run(svm->vmcb_pa);
+ if (sev_es_guest(vcpu->kvm)) {
+ __svm_sev_es_vcpu_run(vmcb_pa);
} else {
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
- __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+ /*
+ * Use a single vmcb (vmcb01 because it's always valid) for
+ * context switching guest state via VMLOAD/VMSAVE, that way
+ * the state doesn't need to be copied between vmcb01 and
+ * vmcb02 when switching vmcbs for nested virtualization.
+ */
+ vmload(svm->vmcb01.pa);
+ __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
+ vmsave(svm->vmcb01.pa);
vmload(__sme_page_pa(sd->save_area));
}
@@ -3845,7 +3788,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
smp_send_reschedule(vcpu->cpu);
}
- pre_svm_run(svm);
+ pre_svm_run(vcpu);
sync_lapic_to_cr8(vcpu);
@@ -3859,7 +3802,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
svm_set_dr6(svm, vcpu->arch.dr6);
else
svm_set_dr6(svm, DR6_ACTIVE_LOW);
@@ -3875,9 +3818,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* is no need to worry about the conditional branch over the wrmsr
* being speculatively taken.
*/
- x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu, svm);
+ svm_vcpu_enter_exit(vcpu);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -3894,15 +3838,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
+ unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
- if (!sev_es_guest(svm->vcpu.kvm))
+ if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
- x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
- if (!sev_es_guest(svm->vcpu.kvm)) {
+ if (!sev_es_guest(vcpu->kvm)) {
vcpu->arch.cr2 = svm->vmcb->save.cr2;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
@@ -3910,7 +3856,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
}
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_before_interrupt(&svm->vcpu);
+ kvm_before_interrupt(vcpu);
kvm_load_host_xsave_state(vcpu);
stgi();
@@ -3918,13 +3864,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
/* Any pending NMI will happen here */
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_after_interrupt(&svm->vcpu);
+ kvm_after_interrupt(vcpu);
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
- if (is_guest_mode(&svm->vcpu)) {
- sync_nested_vmcb_control(svm);
+ if (is_guest_mode(vcpu)) {
+ nested_sync_control_from_vmcb02(svm);
svm->nested.nested_run_pending = 0;
}
@@ -3933,7 +3879,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
/* if exit due to PF check for async PF */
if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
- svm->vcpu.arch.apf.host_apf_flags =
+ vcpu->arch.apf.host_apf_flags =
kvm_read_and_reset_apf_flags();
if (npt_enabled) {
@@ -3947,9 +3893,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
*/
if (unlikely(svm->vmcb->control.exit_code ==
SVM_EXIT_EXCP_BASE + MC_VECTOR))
- svm_handle_mce(svm);
+ svm_handle_mce(vcpu);
- svm_complete_interrupts(svm);
+ svm_complete_interrupts(vcpu);
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
@@ -3957,21 +3903,26 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
return svm_exit_handlers_fastpath(vcpu);
}
-static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level)
{
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long cr3;
- cr3 = __sme_set(root);
if (npt_enabled) {
- svm->vmcb->control.nested_cr3 = cr3;
+ svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
/* Loading L2's CR3 is handled by enter_svm_guest_mode. */
if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
return;
cr3 = vcpu->arch.cr3;
+ } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+ cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+ } else {
+ /* PCID in the guest should be impossible with a 32-bit MMU. */
+ WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+ cr3 = root_hpa;
}
svm->vmcb->save.cr3 = cr3;
@@ -4048,7 +3999,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
/* Update nrips enabled cache */
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
- guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+ guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
/* Check again if INVPCID interception if required */
svm_check_invpcid(svm);
@@ -4060,24 +4011,50 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ /*
+ * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
+ * is exposed to the guest, disable AVIC.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
+ kvm_request_apicv_update(vcpu->kvm, false,
+ APICV_INHIBIT_REASON_X2APIC);
- /*
- * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
- * is exposed to the guest, disable AVIC.
- */
- if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
- kvm_request_apicv_update(vcpu->kvm, false,
- APICV_INHIBIT_REASON_X2APIC);
+ /*
+ * Currently, AVIC does not work with nested virtualization.
+ * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
+ */
+ if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ kvm_request_apicv_update(vcpu->kvm, false,
+ APICV_INHIBIT_REASON_NESTED);
+ }
- /*
- * Currently, AVIC does not work with nested virtualization.
- * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
- */
- if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
- kvm_request_apicv_update(vcpu->kvm, false,
- APICV_INHIBIT_REASON_NESTED);
+ if (guest_cpuid_is_intel(vcpu)) {
+ /*
+ * We must intercept SYSENTER_EIP and SYSENTER_ESP
+ * accesses because the processor only stores 32 bits.
+ * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+ */
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+ } else {
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in VMCB and clear intercepts to avoid #VMEXIT.
+ */
+ if (vls) {
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+ /* No need to intercept these MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+ }
}
static bool svm_has_wbinvd_exit(void)
@@ -4349,15 +4326,15 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
if (!(saved_efer & EFER_SVME))
return 1;
- if (kvm_vcpu_map(&svm->vcpu,
+ if (kvm_vcpu_map(vcpu,
gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
return 1;
if (svm_allocate_nested(svm))
return 1;
- ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+ kvm_vcpu_unmap(vcpu, &map, true);
}
}
@@ -4612,6 +4589,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,
+ .vm_copy_enc_context_from = svm_vm_copy_asid_from,
+
.can_emulate_instruction = svm_can_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 9806aaebc37f..d6206196eec1 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -28,7 +28,10 @@ static const u32 host_save_user_msrs[] = {
};
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-#define MAX_DIRECT_ACCESS_MSRS 18
+#define IOPM_SIZE PAGE_SIZE * 3
+#define MSRPM_SIZE PAGE_SIZE * 2
+
+#define MAX_DIRECT_ACCESS_MSRS 20
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
@@ -65,6 +68,7 @@ struct kvm_sev_info {
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ struct kvm *enc_context_owner; /* Owner of copied encryption context */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
};
@@ -82,11 +86,19 @@ struct kvm_svm {
struct kvm_vcpu;
+struct kvm_vmcb_info {
+ struct vmcb *ptr;
+ unsigned long pa;
+ int cpu;
+ uint64_t asid_generation;
+};
+
struct svm_nested_state {
- struct vmcb *hsave;
+ struct kvm_vmcb_info vmcb02;
u64 hsave_msr;
u64 vm_cr_msr;
u64 vmcb12_gpa;
+ u64 last_vmcb12_gpa;
/* These are the merged vectors */
u32 *msrpm;
@@ -103,13 +115,14 @@ struct svm_nested_state {
struct vcpu_svm {
struct kvm_vcpu vcpu;
+ /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
struct vmcb *vmcb;
- unsigned long vmcb_pa;
+ struct kvm_vmcb_info vmcb01;
+ struct kvm_vmcb_info *current_vmcb;
struct svm_cpu_data *svm_data;
u32 asid;
- uint64_t asid_generation;
- uint64_t sysenter_esp;
- uint64_t sysenter_eip;
+ u32 sysenter_esp_hi;
+ u32 sysenter_eip_hi;
uint64_t tsc_aux;
u64 msr_decfg;
@@ -240,17 +253,14 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
vmcb->control.clean &= ~(1 << bit);
}
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
{
- return container_of(vcpu, struct vcpu_svm, vcpu);
+ return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
}
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(&svm->vcpu))
- return svm->nested.hsave;
- else
- return svm->vmcb;
+ return container_of(vcpu, struct vcpu_svm, vcpu);
}
static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
@@ -273,7 +283,7 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
if (!sev_es_guest(svm->vcpu.kvm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
@@ -300,7 +310,7 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm)
static inline void clr_dr_intercepts(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb->control.intercepts[INTERCEPT_DR] = 0;
@@ -315,7 +325,7 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
WARN_ON_ONCE(bit >= 32);
vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -325,7 +335,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
WARN_ON_ONCE(bit >= 32);
vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -335,7 +345,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb_set_intercept(&vmcb->control, bit);
@@ -344,7 +354,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb_clr_intercept(&vmcb->control, bit);
@@ -406,7 +416,7 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu);
bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
void svm_set_gif(struct vcpu_svm *svm, bool value);
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
int read, int write);
@@ -438,20 +448,30 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
- struct vmcb *nested_vmcb);
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12);
void svm_leave_nested(struct vcpu_svm *svm);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
-int nested_svm_vmrun(struct vcpu_svm *svm);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+ svm->vmcb->control.exit_code = exit_code;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+ return nested_svm_vmexit(svm);
+}
+
int nested_svm_exit_handled(struct vcpu_svm *svm);
-int nested_svm_check_permissions(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
-void sync_nested_vmcb_control(struct vcpu_svm *svm);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
extern struct kvm_x86_nested_ops svm_nested_ops;
@@ -492,8 +512,8 @@ void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm);
void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm);
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
@@ -562,11 +582,12 @@ int svm_register_enc_region(struct kvm *kvm,
struct kvm_enc_region *range);
int svm_unregister_enc_region(struct kvm *kvm,
struct kvm_enc_region *range);
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
void pre_sev_run(struct vcpu_svm *svm, int cpu);
void __init sev_hardware_setup(void);
void sev_hardware_teardown(void);
void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct vcpu_svm *svm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_create_vcpu(struct vcpu_svm *svm);
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 6feb8c08f45a..4fa17df123cd 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -79,28 +79,10 @@ SYM_FUNC_START(__svm_vcpu_run)
/* Enter guest mode */
sti
-1: vmload %_ASM_AX
- jmp 3f
-2: cmpb $0, kvm_rebooting
- jne 3f
- ud2
- _ASM_EXTABLE(1b, 2b)
-3: vmrun %_ASM_AX
- jmp 5f
-4: cmpb $0, kvm_rebooting
- jne 5f
- ud2
- _ASM_EXTABLE(3b, 4b)
+1: vmrun %_ASM_AX
-5: vmsave %_ASM_AX
- jmp 7f
-6: cmpb $0, kvm_rebooting
- jne 7f
- ud2
- _ASM_EXTABLE(5b, 6b)
-7:
- cli
+2: cli
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -167,6 +149,13 @@ SYM_FUNC_START(__svm_vcpu_run)
#endif
pop %_ASM_BP
ret
+
+3: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+
+ _ASM_EXTABLE(1b, 3b)
+
SYM_FUNC_END(__svm_vcpu_run)
/**
@@ -186,18 +175,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
- /* Enter guest mode */
+ /* Move @vmcb to RAX. */
mov %_ASM_ARG1, %_ASM_AX
+
+ /* Enter guest mode */
sti
1: vmrun %_ASM_AX
- jmp 3f
-2: cmpb $0, kvm_rebooting
- jne 3f
- ud2
- _ASM_EXTABLE(1b, 2b)
-3: cli
+2: cli
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -217,4 +203,11 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
pop %_ASM_BP
ret
+
+3: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+
+ _ASM_EXTABLE(1b, 3b)
+
SYM_FUNC_END(__svm_sev_es_vcpu_run)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bcca0b80e0d0..8b111682fe5c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -11,6 +11,7 @@
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"
@@ -21,13 +22,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);
-#define CC(consistency_check) \
-({ \
- bool failed = (consistency_check); \
- if (failed) \
- trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
- failed; \
-})
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
/*
* Hyper-V requires all of these, so mark them as supported even though
@@ -2306,6 +2301,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+ vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
secondary_exec_controls_set(vmx, exec_control);
}
@@ -3453,6 +3451,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
enum nested_evmptrld_status evmptrld_status;
+ ++vcpu->stat.nested_run;
+
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -3810,9 +3810,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
/*
* Process any exceptions that are not debug traps before MTF.
+ *
+ * Note that only a pending nested run can block a pending exception.
+ * Otherwise an injected NMI/interrupt should either be
+ * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
+ * while delivering the pending exception.
*/
+
if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
- if (block_nested_events)
+ if (vmx->nested.nested_run_pending)
return -EBUSY;
if (!nested_vmx_check_exception(vcpu, &exit_qual))
goto no_vmexit;
@@ -3829,7 +3835,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
if (vcpu->arch.exception.pending) {
- if (block_nested_events)
+ if (vmx->nested.nested_run_pending)
return -EBUSY;
if (!nested_vmx_check_exception(vcpu, &exit_qual))
goto no_vmexit;
@@ -4105,6 +4111,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
{
/* update exit information fields: */
vmcs12->vm_exit_reason = vm_exit_reason;
+ if (to_vmx(vcpu)->exit_reason.enclave_mode)
+ vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
vmcs12->exit_qualification = exit_qualification;
vmcs12->vm_exit_intr_info = exit_intr_info;
@@ -4422,6 +4430,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ /* Similarly, triple faults in L2 should never escape. */
+ WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
/* Service the TLB flush request for L2 before switching to L1. */
@@ -4558,6 +4569,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx->fail = 0;
}
+static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
+{
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+}
+
/*
* Decode the memory-address operand of a vmx instruction, as recorded on an
* exit caused by such an instruction (run by a guest hypervisor).
@@ -5479,16 +5495,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
if (!nested_vmx_check_eptp(vcpu, new_eptp))
return 1;
- kvm_mmu_unload(vcpu);
mmu->ept_ad = accessed_dirty;
mmu->mmu_role.base.ad_disabled = !accessed_dirty;
vmcs12->ept_pointer = new_eptp;
- /*
- * TODO: Check what's the correct approach in case
- * mmu reload fails. Currently, we just let the next
- * reload potentially fail
- */
- kvm_mmu_reload(vcpu);
+
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
}
return 0;
@@ -5705,6 +5716,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
return false;
}
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 encls_leaf;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+ return false;
+
+ encls_leaf = kvm_rax_read(vcpu);
+ if (encls_leaf > 62)
+ encls_leaf = 63;
+ return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
+}
+
static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12, gpa_t bitmap)
{
@@ -5801,9 +5827,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_VMFUNC:
/* VM functions are emulated through L2->L0 vmexits. */
return true;
- case EXIT_REASON_ENCLS:
- /* SGX is never exposed to L1 */
- return true;
default:
break;
}
@@ -5927,6 +5950,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_TPAUSE:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+ case EXIT_REASON_ENCLS:
+ return nested_vmx_exit_handled_encls(vcpu, vmcs12);
default:
return true;
}
@@ -6502,6 +6527,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
msrs->secondary_ctls_high |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (enable_sgx)
+ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC,
msrs->misc_low,
@@ -6599,6 +6627,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
struct kvm_x86_nested_ops vmx_nested_ops = {
.check_events = vmx_check_nested_events,
.hv_timer_pending = nested_vmx_preemption_timer_pending,
+ .triple_fault = nested_vmx_triple_fault,
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
.get_nested_state_pages = vmx_get_nested_state_pages,
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 197148d76b8f..184418baeb3c 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
PIN_BASED_EXT_INTR_MASK;
}
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
/*
* if fixed0[i] == 1: val[i] must be 1
* if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644
index 000000000000..6693ebdc0770
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -0,0 +1,502 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2021 Intel Corporation. */
+
+#include <asm/sgx.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "nested.h"
+#include "sgx.h"
+#include "vmx.h"
+#include "x86.h"
+
+bool __read_mostly enable_sgx = 1;
+module_param_named(sgx, enable_sgx, bool, 0444);
+
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode. Related prefixes are ignored.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+ int size, int alignment, gva_t *gva)
+{
+ struct kvm_segment s;
+ bool fault;
+
+ /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
+ *gva = offset;
+ if (!is_long_mode(vcpu)) {
+ vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+ *gva += s.base;
+ }
+
+ if (!IS_ALIGNED(*gva, alignment)) {
+ fault = true;
+ } else if (likely(is_long_mode(vcpu))) {
+ fault = is_noncanonical_address(*gva, vcpu);
+ } else {
+ *gva &= 0xffffffff;
+ fault = (s.unusable) ||
+ (s.type != 2 && s.type != 3) ||
+ (*gva > s.limit) ||
+ ((s.base != 0 || s.limit != 0xffffffff) &&
+ (((u64)*gva + size - 1) > s.limit + 1));
+ }
+ if (fault)
+ kvm_inject_gp(vcpu, 0);
+ return fault ? -EINVAL : 0;
+}
+
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+ unsigned int size)
+{
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = addr;
+ vcpu->run->internal.data[1] = size;
+}
+
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+ unsigned int size)
+{
+ if (__copy_from_user(data, (void __user *)hva, size)) {
+ sgx_handle_emulation_failure(vcpu, hva, size);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+ gpa_t *gpa)
+{
+ struct x86_exception ex;
+
+ if (write)
+ *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex);
+ else
+ *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
+
+ if (*gpa == UNMAPPED_GVA) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+ *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+ if (kvm_is_error_hva(*hva)) {
+ sgx_handle_emulation_failure(vcpu, gpa, 1);
+ return -EFAULT;
+ }
+
+ *hva |= gpa & ~PAGE_MASK;
+
+ return 0;
+}
+
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+ struct x86_exception ex;
+
+ /*
+ * A non-EPCM #PF indicates a bad userspace HVA. This *should* check
+ * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+ * but the error code isn't (yet) plumbed through the ENCLS helpers.
+ */
+ if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ /*
+ * If the guest thinks it's running on SGX2 hardware, inject an SGX
+ * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
+ * #PF on SGX2). The assumption is that EPCM faults are much more
+ * likely than a bad userspace address.
+ */
+ if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+ memset(&ex, 0, sizeof(ex));
+ ex.vector = PF_VECTOR;
+ ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+ PFERR_SGX_MASK;
+ ex.address = gva;
+ ex.error_code_valid = true;
+ ex.nested_page_fault = false;
+ kvm_inject_page_fault(vcpu, &ex);
+ } else {
+ kvm_inject_gp(vcpu, 0);
+ }
+ return 1;
+}
+
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+ struct sgx_pageinfo *pageinfo,
+ unsigned long secs_hva,
+ gva_t secs_gva)
+{
+ struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+ struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+ u64 attributes, xfrm, size;
+ u32 miscselect;
+ u8 max_size_log2;
+ int trapnr, ret;
+
+ sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+ sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+ if (!sgx_12_0 || !sgx_12_1) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ miscselect = contents->miscselect;
+ attributes = contents->attributes;
+ xfrm = contents->xfrm;
+ size = contents->size;
+
+ /* Enforce restriction of access to the PROVISIONKEY. */
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+ (attributes & SGX_ATTR_PROVISIONKEY)) {
+ if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+ pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+ if ((u32)miscselect & ~sgx_12_0->ebx ||
+ (u32)attributes & ~sgx_12_1->eax ||
+ (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+ (u32)xfrm & ~sgx_12_1->ecx ||
+ (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* Enforce CPUID restriction on max enclave size. */
+ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+ sgx_12_0->edx;
+ if (size >= BIT_ULL(max_size_log2))
+ kvm_inject_gp(vcpu, 0);
+
+ /*
+ * sgx_virt_ecreate() returns:
+ * 1) 0: ECREATE was successful
+ * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the
+ * exception number.
+ * 3) -EINVAL: access_ok() on @secs_hva failed. This should never
+ * happen as KVM checks host addresses at memslot creation.
+ * sgx_virt_ecreate() has already warned in this case.
+ */
+ ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+ if (!ret)
+ return kvm_skip_emulated_instruction(vcpu);
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ return ret;
+}
+
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ gva_t pageinfo_gva, secs_gva;
+ gva_t metadata_gva, contents_gva;
+ gpa_t metadata_gpa, contents_gpa, secs_gpa;
+ unsigned long metadata_hva, contents_hva, secs_hva;
+ struct sgx_pageinfo pageinfo;
+ struct sgx_secs *contents;
+ struct x86_exception ex;
+ int r;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+ return 1;
+
+ /*
+ * Copy the PAGEINFO to local memory, its pointers need to be
+ * translated, i.e. we need to do a deep copy/translate.
+ */
+ r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+ sizeof(pageinfo), &ex);
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return 1;
+ } else if (r != X86EMUL_CONTINUE) {
+ sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+ sizeof(pageinfo));
+ return 0;
+ }
+
+ if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
+ sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+ &contents_gva))
+ return 1;
+
+ /*
+ * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
+ sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid.
+ */
+ if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
+ sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+ return 0;
+
+ /*
+ * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+ * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+ * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+ * enforce restriction of access to the PROVISIONKEY.
+ */
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!contents)
+ return -ENOMEM;
+
+ /* Exit to userspace if copying from a host userspace address fails. */
+ if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
+ free_page((unsigned long)contents);
+ return 0;
+ }
+
+ pageinfo.metadata = metadata_hva;
+ pageinfo.contents = (u64)contents;
+
+ r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+ free_page((unsigned long)contents);
+
+ return r;
+}
+
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+ unsigned long sig_hva, secs_hva, token_hva, rflags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gva_t sig_gva, secs_gva, token_gva;
+ gpa_t sig_gpa, secs_gpa, token_gpa;
+ int ret, trapnr;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+ return 1;
+
+ /*
+ * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
+ sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid. Note, all structures are aligned and
+ * cannot split pages.
+ */
+ if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
+ sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+ return 0;
+
+ ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+ (void __user *)secs_hva,
+ vmx->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ /*
+ * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+ * @token_hva or @secs_hva. This should never happen as KVM checks host
+ * addresses at memslot creation. sgx_virt_einit() has already warned
+ * in this case, so just return.
+ */
+ if (ret < 0)
+ return ret;
+
+ rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
+ X86_EFLAGS_AF | X86_EFLAGS_SF |
+ X86_EFLAGS_OF);
+ if (ret)
+ rflags |= X86_EFLAGS_ZF;
+ else
+ rflags &= ~X86_EFLAGS_ZF;
+ vmx_set_rflags(vcpu, rflags);
+
+ kvm_rax_write(vcpu, ret);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+ if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ return false;
+
+ if (leaf >= ECREATE && leaf <= ETRACK)
+ return guest_cpuid_has(vcpu, X86_FEATURE_SGX1);
+
+ if (leaf >= EAUG && leaf <= EMODT)
+ return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+
+ return false;
+}
+
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+ const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+
+ return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits;
+}
+
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+ u32 leaf = (u32)kvm_rax_read(vcpu);
+
+ if (!encls_leaf_enabled_in_guest(vcpu, leaf)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ } else if (!sgx_enabled_in_guest_bios(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ } else {
+ if (leaf == ECREATE)
+ return handle_encls_ecreate(vcpu);
+ if (leaf == EINIT)
+ return handle_encls_einit(vcpu);
+ WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+ return 0;
+ }
+ return 1;
+}
+
+void setup_default_sgx_lepubkeyhash(void)
+{
+ /*
+ * Use Intel's default value for Skylake hardware if Launch Control is
+ * not supported, i.e. Intel's hash is hardcoded into silicon, or if
+ * Launch Control is supported and enabled, i.e. mimic the reset value
+ * and let the guest write the MSRs at will. If Launch Control is
+ * supported but disabled, then use the current MSR values as the hash
+ * MSRs exist but are read-only (locked and not writable).
+ */
+ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
+ rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+ sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+ sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+ sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+ sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+ } else {
+ /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+ }
+}
+
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+ sizeof(sgx_pubkey_hash));
+}
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *guest_cpuid;
+ u32 eax, ebx, ecx, edx;
+
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
+ guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
+ return true;
+
+ return false;
+}
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ /*
+ * There is no software enable bit for SGX that is virtualized by
+ * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+ * guest (either by the host or by the guest's BIOS) but enabled in the
+ * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
+ * the expected system behavior for ENCLS.
+ */
+ u64 bitmap = -1ull;
+
+ /* Nothing to do if hardware doesn't support SGX */
+ if (!cpu_has_vmx_encls_vmexit())
+ return;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+ sgx_enabled_in_guest_bios(vcpu)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+ if (sgx_intercept_encls_ecreate(vcpu))
+ bitmap |= (1 << ECREATE);
+ }
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+ bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+ /*
+ * Trap and execute EINIT if launch control is enabled in the
+ * host using the guest's values for launch control MSRs, even
+ * if the guest's values are fixed to hardware default values.
+ * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+ * the MSRs is extraordinarily expensive.
+ */
+ if (boot_cpu_has(X86_FEATURE_SGX_LC))
+ bitmap |= (1 << EINIT);
+
+ if (!vmcs12 && is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+ bitmap |= vmcs12->encls_exiting_bitmap;
+ }
+ vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
new file mode 100644
index 000000000000..a400888b376d
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#include "capabilities.h"
+#include "vmx_ops.h"
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
+#else
+#define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ /* Nothing to do if hardware doesn't support SGX */
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+#endif
+
+#endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index c8e51c004f78..034adb6404dc 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VMREAD_BITMAP, vmread_bitmap),
FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+ FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 80232daf00ff..13494956d0e9 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -69,7 +69,8 @@ struct __packed vmcs12 {
u64 vm_function_control;
u64 eptp_list_address;
u64 pml_address;
- u64 padding64[3]; /* room for future expansion */
+ u64 encls_exiting_bitmap;
+ u64 padding64[2]; /* room for future expansion */
/*
* To allow migration of L1 (complete with its L2 guests) between
* machines of different natural widths (32 or 64 bit), we cannot have
@@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(vm_function_control, 296);
CHECK_OFFSET(eptp_list_address, 304);
CHECK_OFFSET(pml_address, 312);
+ CHECK_OFFSET(encls_exiting_bitmap, 320);
CHECK_OFFSET(cr0_guest_host_mask, 344);
CHECK_OFFSET(cr4_guest_host_mask, 352);
CHECK_OFFSET(cr0_read_shadow, 360);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 32cf8287d4a7..6501d66167b8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -57,6 +57,7 @@
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
@@ -472,26 +473,6 @@ static const u32 vmx_uret_msrs_list[] = {
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- u64 tmp_eptp = INVALID_PAGE;
- int i;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!VALID_PAGE(tmp_eptp)) {
- tmp_eptp = to_vmx(vcpu)->ept_pointer;
- } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
- to_kvm_vmx(kvm)->ept_pointers_match
- = EPT_POINTERS_MISMATCH;
- return;
- }
- }
-
- to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
-
static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
void *data)
{
@@ -501,47 +482,70 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush
range->pages);
}
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
+static inline int hv_remote_flush_root_ept(hpa_t root_ept,
+ struct kvm_tlb_range *range)
{
- u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
-
- /*
- * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
- * of the base of EPT PML4 table, strip off EPT configuration
- * information.
- */
if (range)
- return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
+ return hyperv_flush_guest_mapping_range(root_ept,
kvm_fill_hv_flush_list_func, (void *)range);
else
- return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
+ return hyperv_flush_guest_mapping(root_ept);
}
static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_tlb_range *range)
{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
struct kvm_vcpu *vcpu;
- int ret = 0, i;
+ int ret = 0, i, nr_unique_valid_roots;
+ hpa_t root;
- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ spin_lock(&kvm_vmx->hv_root_ept_lock);
- if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
- check_ept_pointer_match(kvm);
+ if (!VALID_PAGE(kvm_vmx->hv_root_ept)) {
+ nr_unique_valid_roots = 0;
- if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+ /*
+ * Flush all valid roots, and see if all vCPUs have converged
+ * on a common root, in which case future flushes can skip the
+ * loop and flush the common root.
+ */
kvm_for_each_vcpu(i, vcpu, kvm) {
- /* If ept_pointer is invalid pointer, bypass flush request. */
- if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
- ret |= __hv_remote_flush_tlb_with_range(
- kvm, vcpu, range);
+ root = to_vmx(vcpu)->hv_root_ept;
+ if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept)
+ continue;
+
+ /*
+ * Set the tracked root to the first valid root. Keep
+ * this root for the entirety of the loop even if more
+ * roots are encountered as a low effort optimization
+ * to avoid flushing the same (first) root again.
+ */
+ if (++nr_unique_valid_roots == 1)
+ kvm_vmx->hv_root_ept = root;
+
+ if (!ret)
+ ret = hv_remote_flush_root_ept(root, range);
+
+ /*
+ * Stop processing roots if a failure occurred and
+ * multiple valid roots have already been detected.
+ */
+ if (ret && nr_unique_valid_roots > 1)
+ break;
}
+
+ /*
+ * The optimized flush of a single root can't be used if there
+ * are multiple valid roots (obviously).
+ */
+ if (nr_unique_valid_roots > 1)
+ kvm_vmx->hv_root_ept = INVALID_PAGE;
} else {
- ret = __hv_remote_flush_tlb_with_range(kvm,
- kvm_get_vcpu(kvm, 0), range);
+ ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range);
}
- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ spin_unlock(&kvm_vmx->hv_root_ept_lock);
return ret;
}
static int hv_remote_flush_tlb(struct kvm *kvm)
@@ -559,7 +563,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
* evmcs in singe VM shares same assist page.
*/
if (!*p_hv_pa_pg)
- *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
if (!*p_hv_pa_pg)
return -ENOMEM;
@@ -576,6 +580,21 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
#endif /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept)
+{
+#if IS_ENABLED(CONFIG_HYPERV)
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+ if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ spin_lock(&kvm_vmx->hv_root_ept_lock);
+ to_vmx(vcpu)->hv_root_ept = root_ept;
+ if (root_ept != kvm_vmx->hv_root_ept)
+ kvm_vmx->hv_root_ept = INVALID_PAGE;
+ spin_unlock(&kvm_vmx->hv_root_ept_lock);
+ }
+#endif
+}
+
/*
* Comment's format: document - errata name - stepping - processor name.
* Refer from
@@ -1570,12 +1589,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
{
+ /*
+ * Emulation of instructions in SGX enclaves is impossible as RIP does
+ * not point tthe failing instruction, and even if it did, the code
+ * stream is inaccessible. Inject #UD instead of exiting to userspace
+ * so that guest userspace can't DoS the guest simply by triggering
+ * emulation (enclaves are CPL3 only).
+ */
+ if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+ }
return true;
}
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
+ union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
unsigned long rip, orig_rip;
+ u32 instr_len;
/*
* Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1586,9 +1618,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
* i.e. we end up advancing IP with some random value.
*/
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
- to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+ instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+ /*
+ * Emulating an enclave's instructions isn't supported as KVM
+ * cannot access the enclave's memory or its true RIP, e.g. the
+ * vmcs.GUEST_RIP points at the exit point of the enclave, not
+ * the RIP that actually triggered the VM-Exit. But, because
+ * most instructions that cause VM-Exit will #UD in an enclave,
+ * most instruction-based VM-Exits simply do not occur.
+ *
+ * There are a few exceptions, notably the debug instructions
+ * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+ * and generate #DB/#BP as expected, which KVM might intercept.
+ * But again, the CPU does the dirty work and saves an instr
+ * length of zero so VMMs don't shoot themselves in the foot.
+ * WARN if KVM tries to skip a non-zero length instruction on
+ * a VM-Exit from an enclave.
+ */
+ if (!instr_len)
+ goto rip_updated;
+
+ WARN(exit_reason.enclave_mode,
+ "KVM: skipping instruction after SGX enclave VM-Exit");
+
orig_rip = kvm_rip_read(vcpu);
- rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ rip = orig_rip + instr_len;
#ifdef CONFIG_X86_64
/*
* We need to mask out the high 32 bits of RIP if not in 64-bit
@@ -1604,6 +1660,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
return 0;
}
+rip_updated:
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
@@ -1865,6 +1922,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_FEAT_CTL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ return 1;
+ msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+ [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+ break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
@@ -2158,6 +2222,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
+
+ /* SGX may be enabled/disabled by guest's firmware */
+ vmx_write_encls_bitmap(vcpu, NULL);
+ break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ /*
+ * On real hardware, the LE hash MSRs are writable before
+ * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+ * at which point SGX related bits in IA32_FEATURE_CONTROL
+ * become writable.
+ *
+ * KVM does not emulate SGX activation for simplicity, so
+ * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+ * is unlocked. This is technically not architectural
+ * behavior, but it's close enough.
+ */
+ if (!msr_info->host_initiated &&
+ (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+ ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+ !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+ return 1;
+ vmx->msr_ia32_sgxlepubkeyhash
+ [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!msr_info->host_initiated)
@@ -3088,8 +3175,7 @@ static int vmx_get_max_tdp_level(void)
return 4;
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
- int root_level)
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
u64 eptp = VMX_EPTP_MT_WB;
@@ -3098,13 +3184,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
eptp |= VMX_EPTP_AD_ENABLE_BIT;
- eptp |= (root_hpa & PAGE_MASK);
+ eptp |= root_hpa;
return eptp;
}
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
- int pgd_level)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3112,16 +3198,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, pgd, pgd_level);
+ eptp = construct_eptp(vcpu, root_hpa, root_level);
vmcs_write64(EPT_POINTER, eptp);
- if (kvm_x86_ops.tlb_remote_flush) {
- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- to_vmx(vcpu)->ept_pointer = eptp;
- to_kvm_vmx(kvm)->ept_pointers_match
- = EPT_POINTERS_CHECK;
- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- }
+ hv_track_root_ept(vcpu, root_hpa);
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
@@ -3131,7 +3211,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
update_guest_cr3 = false;
vmx_ept_load_pdptrs(vcpu);
} else {
- guest_cr3 = pgd;
+ guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
}
if (update_guest_cr3)
@@ -4314,15 +4394,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
vmx->secondary_exec_control = exec_control;
}
-static void ept_set_mmio_spte_mask(void)
-{
- /*
- * EPT Misconfigurations can be generated if the value of bits 2:0
- * of an EPT paging-structure entry is 110b (write/execute).
- */
- kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
-}
-
#define VMX_XSS_EXIT_BITMAP 0
/*
@@ -4410,8 +4481,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
- if (cpu_has_vmx_encls_vmexit())
- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+ vmx_write_encls_bitmap(&vmx->vcpu, NULL);
if (vmx_pt_mode_is_host_guest()) {
memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
@@ -5184,17 +5254,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
- /* Treat an INVD instruction as a NOP and just skip it. */
- return kvm_skip_emulated_instruction(vcpu);
-}
-
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
@@ -5203,28 +5262,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
- int err;
-
- err = kvm_rdpmc(vcpu);
- return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
- u64 new_bv = kvm_read_edx_eax(vcpu);
- u32 index = kvm_rcx_read(vcpu);
-
- int err = kvm_set_xcr(vcpu, index, new_bv);
- return kvm_complete_insn_gp(vcpu, err);
-}
-
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
@@ -5384,6 +5421,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+ return 1;
+
/*
* A nested guest cannot optimize MMIO vmexits, because we have an
* nGPA here instead of the required GPA.
@@ -5485,18 +5525,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-static void vmx_enable_tdp(void)
-{
- kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
- enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
- enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK,
- cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
- VMX_EPT_RWX_MASK, 0ull);
-
- ept_set_mmio_spte_mask();
-}
-
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -5516,34 +5544,11 @@ static int handle_pause(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
- return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
- printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
- return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
-}
-
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
- printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
- return handle_nop(vcpu);
-}
-
static int handle_invpcid(struct kvm_vcpu *vcpu)
{
u32 vmx_instruction_info;
@@ -5632,16 +5637,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
/*
- * SGX virtualization is not yet supported. There is no software
- * enable bit for SGX, so we have to trap ENCLS and inject a #UD
- * to prevent the guest from executing ENCLS.
+ * SGX virtualization is disabled. There is no software enable bit for
+ * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+ * the guest from executing ENCLS (when SGX is supported by hardware).
*/
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
+#endif /* CONFIG_X86_SGX_KVM */
static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
@@ -5668,10 +5675,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
[EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
[EXIT_REASON_HLT] = kvm_emulate_halt,
- [EXIT_REASON_INVD] = handle_invd,
+ [EXIT_REASON_INVD] = kvm_emulate_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
- [EXIT_REASON_RDPMC] = handle_rdpmc,
- [EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
+ [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
[EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
[EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
[EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
@@ -5685,8 +5692,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_APIC_WRITE] = handle_apic_write,
[EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
- [EXIT_REASON_WBINVD] = handle_wbinvd,
- [EXIT_REASON_XSETBV] = handle_xsetbv,
+ [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
+ [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_GDTR_IDTR] = handle_desc,
@@ -5694,13 +5701,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
- [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
[EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
- [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
[EXIT_REASON_INVEPT] = handle_vmx_instruction,
[EXIT_REASON_INVVPID] = handle_vmx_instruction,
- [EXIT_REASON_RDRAND] = handle_invalid_op,
- [EXIT_REASON_RDSEED] = handle_invalid_op,
+ [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
+ [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
@@ -5787,12 +5794,23 @@ static void vmx_dump_dtsel(char *name, uint32_t limit)
vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
}
-void dump_vmcs(void)
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
+{
+ unsigned int i;
+ struct vmx_msr_entry *e;
+
+ pr_err("MSR %s:\n", name);
+ for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+ pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmentry_ctl, vmexit_ctl;
u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
unsigned long cr4;
- u64 efer;
+ int efer_slot;
if (!dump_invalid_vmcs) {
pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
@@ -5804,7 +5822,6 @@ void dump_vmcs(void)
cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
cr4 = vmcs_readl(GUEST_CR4);
- efer = vmcs_read64(GUEST_IA32_EFER);
secondary_exec_control = 0;
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -5816,9 +5833,7 @@ void dump_vmcs(void)
pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
- if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
- (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
- {
+ if (cpu_has_vmx_ept()) {
pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
@@ -5841,10 +5856,20 @@ void dump_vmcs(void)
vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
- if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
- (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
- efer, vmcs_read64(GUEST_IA32_PAT));
+ efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+ else if (efer_slot >= 0)
+ pr_err("EFER= 0x%016llx (autoload)\n",
+ vmx->msr_autoload.guest.val[efer_slot].value);
+ else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer | (EFER_LMA | EFER_LME));
+ else
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
@@ -5860,6 +5885,10 @@ void dump_vmcs(void)
if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
pr_err("InterruptStatus = %04x\n",
vmcs_read16(GUEST_INTR_STATUS));
+ if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+ if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+ vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
pr_err("*** Host State ***\n");
pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
@@ -5881,14 +5910,16 @@ void dump_vmcs(void)
vmcs_readl(HOST_IA32_SYSENTER_ESP),
vmcs_read32(HOST_IA32_SYSENTER_CS),
vmcs_readl(HOST_IA32_SYSENTER_EIP));
- if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
- vmcs_read64(HOST_IA32_EFER),
- vmcs_read64(HOST_IA32_PAT));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
if (cpu_has_load_perf_global_ctrl() &&
vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+ if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
pr_err("*** Control State ***\n");
pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -5997,7 +6028,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
}
if (exit_reason.failed_vmentry) {
- dump_vmcs();
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason.full;
@@ -6006,7 +6037,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
}
if (unlikely(vmx->fail)) {
- dump_vmcs();
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
@@ -6092,7 +6123,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
unexpected_vmexit:
vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
exit_reason.full);
- dump_vmcs();
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
@@ -6976,6 +7007,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
else
memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
+ vcpu_setup_sgx_lepubkeyhash(vcpu);
+
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
@@ -6989,8 +7022,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
vmx->pi_desc.nv = POSTED_INTR_VECTOR;
vmx->pi_desc.sn = 1;
- vmx->ept_pointer = INVALID_PAGE;
-
+#if IS_ENABLED(CONFIG_HYPERV)
+ vmx->hv_root_ept = INVALID_PAGE;
+#endif
return 0;
free_vmcs:
@@ -7007,7 +7041,9 @@ free_vpid:
static int vmx_vm_init(struct kvm *kvm)
{
- spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+#if IS_ENABLED(CONFIG_HYPERV)
+ spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock);
+#endif
if (!ple_gap)
kvm->arch.pause_in_guest = true;
@@ -7302,6 +7338,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
set_cr4_guest_host_mask(vmx);
+ vmx_write_encls_bitmap(vcpu, NULL);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ vmx->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_SGX_LC_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &=
+ ~FEAT_CTL_SGX_LC_ENABLED;
+
/* Refresh #PF interception to account for MAXPHYADDR changes. */
vmx_update_exception_bitmap(vcpu);
}
@@ -7322,6 +7371,13 @@ static __init void vmx_set_cpu_caps(void)
if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+ if (!enable_sgx) {
+ kvm_cpu_cap_clear(X86_FEATURE_SGX);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ }
+
if (vmx_umip_emulated())
kvm_cpu_cap_set(X86_FEATURE_UMIP);
@@ -7848,7 +7904,8 @@ static __init int hardware_setup(void)
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
- vmx_enable_tdp();
+ kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+ cpu_has_vmx_ept_execute_only());
if (!enable_ept)
ept_lpage_level = 0;
@@ -7909,6 +7966,8 @@ static __init int hardware_setup(void)
if (!enable_ept || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
+ setup_default_sgx_lepubkeyhash();
+
if (nested) {
nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
vmx_capability.ept);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 89da5e1251f1..19fe09fad2fe 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -325,7 +325,12 @@ struct vcpu_vmx {
*/
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
- u64 ept_pointer;
+ /* SGX Launch Control public key hash */
+ u64 msr_ia32_sgxlepubkeyhash[4];
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ u64 hv_root_ept;
+#endif
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
@@ -338,12 +343,6 @@ struct vcpu_vmx {
} shadow_msr_intercept;
};
-enum ept_pointers_status {
- EPT_POINTERS_CHECK = 0,
- EPT_POINTERS_MATCH = 1,
- EPT_POINTERS_MISMATCH = 2
-};
-
struct kvm_vmx {
struct kvm kvm;
@@ -351,8 +350,10 @@ struct kvm_vmx {
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;
- enum ept_pointers_status ept_pointers_match;
- spinlock_t ept_pointer_lock;
+#if IS_ENABLED(CONFIG_HYPERV)
+ hpa_t hv_root_ept;
+ spinlock_t hv_root_ept_lock;
+#endif
};
bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
@@ -376,8 +377,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
- int root_level);
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
@@ -543,6 +543,6 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
}
-void dump_vmcs(void);
+void dump_vmcs(struct kvm_vcpu *vcpu);
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 692b0c31c9c8..164b64f65a8f 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -37,6 +37,10 @@ static __always_inline void vmcs_check32(unsigned long field)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
"32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "32-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "32-bit accessor invalid for 64-bit high field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
"32-bit accessor invalid for natural width field");
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eca63625aee4..f0d0b6e043ae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -75,6 +75,7 @@
#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
#include <clocksource/hyperv_timer.h>
#define CREATE_TRACE_POINTS
@@ -245,6 +246,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("l1d_flush", l1d_flush),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+ VCPU_STAT("nested_run", nested_run),
+ VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
+ VCPU_STAT("directed_yield_successful", directed_yield_successful),
VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
VM_STAT("mmu_pte_write", mmu_pte_write),
VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -543,8 +547,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
- if (has_error && !is_protmode(vcpu))
- has_error = false;
if (reinject) {
/*
* On vmentry, vcpu->arch.exception.pending is only
@@ -983,14 +985,17 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
return 0;
}
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
- if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
- return __kvm_set_xcr(vcpu, index, xcr);
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+ __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
+EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
@@ -1191,20 +1196,21 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data;
- int err;
- err = kvm_pmu_rdpmc(vcpu, ecx, &data);
- if (err)
- return err;
+ if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
kvm_rax_write(vcpu, (u32)data);
kvm_rdx_write(vcpu, data >> 32);
- return err;
+ return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_rdpmc);
+EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -1791,6 +1797,40 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+ pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+ pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
xfer_to_guest_mode_prepare();
@@ -3382,6 +3422,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0;
break;
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+ if (!msr_info->host_initiated)
+ return 1;
+ msr_info->data = 0;
+ break;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -3771,8 +3817,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_X86_USER_SPACE_MSR:
case KVM_CAP_X86_MSR_FILTER:
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE:
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ return KVM_GUESTDBG_VALID_MASK;
#ifdef CONFIG_KVM_XEN
case KVM_CAP_XEN_HVM:
r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
@@ -4675,7 +4727,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
kvm_update_pv_runtime(vcpu);
return 0;
-
default:
return -EINVAL;
}
@@ -5357,6 +5408,28 @@ split_irqchip_unlock:
kvm->arch.bus_lock_detection_enabled = true;
r = 0;
break;
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE: {
+ unsigned long allowed_attributes = 0;
+
+ r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+ if (r)
+ break;
+
+ /* KVM only supports the PROVISIONKEY privileged attribute. */
+ if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+ !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+ kvm->arch.sgx_provisioning_allowed = true;
+ else
+ r = -EINVAL;
+ break;
+ }
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (kvm_x86_ops.vm_copy_enc_context_from)
+ r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
+ return r;
default:
r = -EINVAL;
break;
@@ -6001,6 +6074,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
@@ -6017,6 +6091,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
/* uses this to access any guest's mapped memory without checking CPL */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
@@ -8045,9 +8120,6 @@ int kvm_arch_init(void *opaque)
if (r)
goto out_free_percpu;
- kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0,
- PT_PRESENT_MASK, 0, sme_me_mask);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -8207,21 +8279,35 @@ void kvm_apicv_init(struct kvm *kvm, bool enable)
}
EXPORT_SYMBOL_GPL(kvm_apicv_init);
-static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
{
struct kvm_vcpu *target = NULL;
struct kvm_apic_map *map;
+ vcpu->stat.directed_yield_attempted++;
+
rcu_read_lock();
- map = rcu_dereference(kvm->arch.apic_map);
+ map = rcu_dereference(vcpu->kvm->arch.apic_map);
if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
target = map->phys_map[dest_id]->vcpu;
rcu_read_unlock();
- if (target && READ_ONCE(target->ready))
- kvm_vcpu_yield_to(target);
+ if (!target || !READ_ONCE(target->ready))
+ goto no_yield;
+
+ /* Ignore requests to yield to self */
+ if (vcpu == target)
+ goto no_yield;
+
+ if (kvm_vcpu_yield_to(target) <= 0)
+ goto no_yield;
+
+ vcpu->stat.directed_yield_successful++;
+
+no_yield:
+ return;
}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -8268,7 +8354,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
break;
kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
- kvm_sched_yield(vcpu->kvm, a1);
+ kvm_sched_yield(vcpu, a1);
ret = 0;
break;
#ifdef CONFIG_X86_64
@@ -8286,7 +8372,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
break;
- kvm_sched_yield(vcpu->kvm, a0);
+ kvm_sched_yield(vcpu, a0);
ret = 0;
break;
default:
@@ -8369,6 +8455,27 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
}
+
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+ return -EIO;
+
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ return 1;
+ }
+
+ return kvm_x86_ops.nested_ops->check_events(vcpu);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ static_call(kvm_x86_queue_exception)(vcpu);
+}
+
static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
{
int r;
@@ -8377,7 +8484,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected) {
- static_call(kvm_x86_queue_exception)(vcpu);
+ kvm_inject_exception(vcpu);
can_inject = false;
}
/*
@@ -8414,7 +8521,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
* from L2 to L1.
*/
if (is_guest_mode(vcpu)) {
- r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ r = kvm_check_nested_events(vcpu);
if (r < 0)
goto busy;
}
@@ -8440,7 +8547,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
}
}
- static_call(kvm_x86_queue_exception)(vcpu);
+ kvm_inject_exception(vcpu);
can_inject = false;
}
@@ -8977,10 +9084,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
- vcpu->mmio_needed = 0;
- r = 0;
- goto out;
+ if (is_guest_mode(vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ } else {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ r = 0;
+ goto out;
+ }
}
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
@@ -9278,7 +9389,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu))
- kvm_x86_ops.nested_ops->check_events(vcpu);
+ kvm_check_nested_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
@@ -11020,6 +11131,14 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
return false;
}
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+ return true;
+
+ return false;
+}
+
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
return vcpu->arch.preempted_in_kernel;
@@ -11541,7 +11660,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
fallthrough;
case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_mmu_unload(vcpu);
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
return kvm_skip_emulated_instruction(vcpu);
default:
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9035e34aa156..5334bf43fdb8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,14 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
+({ \
+ bool failed = (consistency_check); \
+ if (failed) \
+ trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+ failed; \
+})
+
#define KVM_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
#define KVM_DEFAULT_PLE_WINDOW_GROW 2
@@ -48,6 +56,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;