50 files changed, 2623 insertions(+), 246 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index a71d91978d9e..806aa0e75afb 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1403,6 +1403,12 @@ Instead, an abort (data abort if the cause of the page-table update was a load or a store, instruction abort if it was an instruction fetch) is injected in the guest. +S390: +^^^^^ + +Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set. +Returns -EINVAL if called on a protected VM. + 4.36 KVM_SET_TSS_ADDR --------------------- @@ -6273,6 +6279,12 @@ state. At VM creation time, all memory is shared, i.e. the PRIVATE attribute is '0' for all gfns. Userspace can control whether memory is shared/private by toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed. +S390: +^^^^^ + +Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set. +Returns -EINVAL if called on a protected VM. + 4.141 KVM_SET_MEMORY_ATTRIBUTES ------------------------------- @@ -6352,6 +6364,61 @@ a single guest_memfd file, but the bound ranges must not overlap). See KVM_SET_USER_MEMORY_REGION2 for additional details. +4.143 KVM_PRE_FAULT_MEMORY +------------------------ + +:Capability: KVM_CAP_PRE_FAULT_MEMORY +:Architectures: none +:Type: vcpu ioctl +:Parameters: struct kvm_pre_fault_memory (in/out) +:Returns: 0 if at least one page is processed, < 0 on error + +Errors: + + ========== =============================================================== + EINVAL The specified `gpa` and `size` were invalid (e.g. not + page aligned, causes an overflow, or size is zero). + ENOENT The specified `gpa` is outside defined memslots. + EINTR An unmasked signal is pending and no page was processed. + EFAULT The parameter address was invalid. + EOPNOTSUPP Mapping memory for a GPA is unsupported by the + hypervisor, and/or for the current vCPU state/mode. + EIO unexpected error conditions (also causes a WARN) + ========== =============================================================== + +:: + + struct kvm_pre_fault_memory { + /* in/out */ + __u64 gpa; + __u64 size; + /* in */ + __u64 flags; + __u64 padding[5]; + }; + +KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory +for the current vCPU state. KVM maps memory as if the vCPU generated a +stage-2 read page fault, e.g. faults in memory as needed, but doesn't break +CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed. + +In some cases, multiple vCPUs might share the page tables. In this +case, the ioctl can be called in parallel. + +When the ioctl returns, the input values are updated to point to the +remaining range. If `size` > 0 on return, the caller can just issue +the ioctl again with the same `struct kvm_map_memory` argument. + +Shadow page tables cannot support this ioctl because they +are indexed by virtual address or nested guest physical address. +Calling this ioctl when the guest is using shadow page tables (for +example because it is running a nested guest with nested page tables) +will fail with `EOPNOTSUPP` even if `KVM_CHECK_EXTENSION` reports +the capability to be present. + +`flags` must currently be zero. + + 5. 
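The pre-fault flow described above is driven entirely from userspace. Below is a minimal, hedged sketch of a caller loop; ``struct kvm_pre_fault_memory`` and ``KVM_PRE_FAULT_MEMORY`` come from the uapi added in this series, while the ``vcpu_fd`` handling and error policy are illustrative assumptions rather than part of the patch::

  #include <errno.h>
  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /*
   * Illustrative only: pre-fault one GPA range on a single vCPU, retrying
   * while the kernel reports partial progress. Assumes vcpu_fd came from
   * KVM_CREATE_VCPU and that the range is covered by memslots.
   */
  static int pre_fault_range(int vcpu_fd, uint64_t gpa, uint64_t size)
  {
          struct kvm_pre_fault_memory range = {
                  .gpa = gpa,
                  .size = size,
                  .flags = 0,     /* must currently be zero */
          };

          while (range.size) {
                  if (ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range) < 0) {
                          if (errno == EINTR)
                                  continue;       /* signal, nothing processed */
                          return -errno;          /* e.g. ENOENT, EOPNOTSUPP */
                  }
                  /* On success, gpa/size already point at the remaining range. */
          }
          return 0;
  }

Because the kernel advances ``gpa``/``size`` on partial progress, re-issuing the same structure is sufficient; only ``EINTR`` needs an explicit retry.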
The kvm_run structure ======================== diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 9677a0714a39..1ddb6a86ce7f 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -466,6 +466,112 @@ issued by the hypervisor to make the guest ready for execution. Returns: 0 on success, -negative on error +18. KVM_SEV_SNP_LAUNCH_START +---------------------------- + +The KVM_SNP_LAUNCH_START command is used for creating the memory encryption +context for the SEV-SNP guest. It must be called prior to issuing +KVM_SEV_SNP_LAUNCH_UPDATE or KVM_SEV_SNP_LAUNCH_FINISH; + +Parameters (in): struct kvm_sev_snp_launch_start + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_snp_launch_start { + __u64 policy; /* Guest policy to use. */ + __u8 gosvw[16]; /* Guest OS visible workarounds. */ + __u16 flags; /* Must be zero. */ + __u8 pad0[6]; + __u64 pad1[4]; + }; + +See SNP_LAUNCH_START in the SEV-SNP specification [snp-fw-abi]_ for further +details on the input parameters in ``struct kvm_sev_snp_launch_start``. + +19. KVM_SEV_SNP_LAUNCH_UPDATE +----------------------------- + +The KVM_SEV_SNP_LAUNCH_UPDATE command is used for loading userspace-provided +data into a guest GPA range, measuring the contents into the SNP guest context +created by KVM_SEV_SNP_LAUNCH_START, and then encrypting/validating that GPA +range so that it will be immediately readable using the encryption key +associated with the guest context once it is booted, after which point it can +attest the measurement associated with its context before unlocking any +secrets. + +It is required that the GPA ranges initialized by this command have had the +KVM_MEMORY_ATTRIBUTE_PRIVATE attribute set in advance. See the documentation +for KVM_SET_MEMORY_ATTRIBUTES for more details on this aspect. + +Upon success, this command is not guaranteed to have processed the entire +range requested. Instead, the ``gfn_start``, ``uaddr``, and ``len`` fields of +``struct kvm_sev_snp_launch_update`` will be updated to correspond to the +remaining range that has yet to be processed. The caller should continue +calling this command until those fields indicate the entire range has been +processed, e.g. ``len`` is 0, ``gfn_start`` is equal to the last GFN in the +range plus 1, and ``uaddr`` is the last byte of the userspace-provided source +buffer address plus 1. In the case where ``type`` is KVM_SEV_SNP_PAGE_TYPE_ZERO, +``uaddr`` will be ignored completely. + +Parameters (in): struct kvm_sev_snp_launch_update + +Returns: 0 on success, < 0 on error, -EAGAIN if caller should retry + +:: + + struct kvm_sev_snp_launch_update { + __u64 gfn_start; /* Guest page number to load/encrypt data into. */ + __u64 uaddr; /* Userspace address of data to be loaded/encrypted. */ + __u64 len; /* 4k-aligned length in bytes to copy into guest memory.*/ + __u8 type; /* The type of the guest pages being initialized. */ + __u8 pad0; + __u16 flags; /* Must be zero. */ + __u32 pad1; + __u64 pad2[4]; + + }; + +where the allowed values for page_type are #define'd as:: + + KVM_SEV_SNP_PAGE_TYPE_NORMAL + KVM_SEV_SNP_PAGE_TYPE_ZERO + KVM_SEV_SNP_PAGE_TYPE_UNMEASURED + KVM_SEV_SNP_PAGE_TYPE_SECRETS + KVM_SEV_SNP_PAGE_TYPE_CPUID + +See the SEV-SNP spec [snp-fw-abi]_ for further details on how each page type is +used/measured. + +20. 
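For reference, a hedged userspace sketch of reaching KVM_SEV_SNP_LAUNCH_START: the VM type, ``KVM_SEV_INIT2`` and the launch structure are uapi additions from this series, the ``/dev/sev`` handle is the AMD secure processor device, and the policy bit positions are restated here from the firmware ABI rather than taken from any header (treat them as assumptions)::

  #include <fcntl.h>
  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Assumed policy bit positions per the SEV-SNP firmware ABI "Guest Policy". */
  #define SNP_POLICY_SMT          (1ULL << 16)    /* SMT allowed */
  #define SNP_POLICY_RSVD_MBO     (1ULL << 17)    /* reserved, must be one */

  static int snp_vm_launch_start(int kvm_fd, int *vm_fd_out)
  {
          struct kvm_sev_snp_launch_start start = {
                  .policy = SNP_POLICY_RSVD_MBO | SNP_POLICY_SMT,
          };
          struct kvm_sev_init init = {};          /* no extra VMSA features */
          struct kvm_sev_cmd cmd = {};
          int vm_fd, sev_fd;

          sev_fd = open("/dev/sev", O_RDWR);      /* AMD secure processor device */
          /* SNP guests must be created with the dedicated VM type. */
          vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_SNP_VM);
          if (sev_fd < 0 || vm_fd < 0)
                  return -1;

          cmd.sev_fd = sev_fd;

          /* Initialize the SEV context, then create and bind the SNP context. */
          cmd.id = KVM_SEV_INIT2;
          cmd.data = (uint64_t)(uintptr_t)&init;
          if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
                  return -1;

          cmd.id = KVM_SEV_SNP_LAUNCH_START;
          cmd.data = (uint64_t)(uintptr_t)&start;
          if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
                  return -1;      /* cmd.error carries the firmware status */

          *vm_fd_out = vm_fd;
          return 0;
  }

The kernel-side checks in snp_launch_start() require RSVD_MBO and SMT to be set and reject SINGLE_SOCKET, which is why the sketch sets exactly those two bits.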
KVM_SEV_SNP_LAUNCH_FINISH +----------------------------- + +After completion of the SNP guest launch flow, the KVM_SEV_SNP_LAUNCH_FINISH +command can be issued to make the guest ready for execution. + +Parameters (in): struct kvm_sev_snp_launch_finish + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[32]; + __u8 pad0[3]; + __u16 flags; /* Must be zero */ + __u64 pad1[4]; + }; + + +See SNP_LAUNCH_FINISH in the SEV-SNP specification [snp-fw-abi]_ for further +details on the input parameters in ``struct kvm_sev_snp_launch_finish``. + Device attribute API ==================== @@ -497,9 +603,11 @@ References ========== -See [white-paper]_, [api-spec]_, [amd-apm]_ and [kvm-forum]_ for more info. +See [white-paper]_, [api-spec]_, [amd-apm]_, [kvm-forum]_, and [snp-fw-abi]_ +for more info. .. [white-paper] https://developer.amd.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf .. [api-spec] https://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf .. [amd-apm] https://support.amd.com/TechDocs/24593.pdf (section 15.34) .. [kvm-forum] https://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf +.. [snp-fw-abi] https://www.amd.com/system/files/TechDocs/56860.pdf diff --git a/arch/riscv/include/asm/kvm_aia_aplic.h b/arch/riscv/include/asm/kvm_aia_aplic.h deleted file mode 100644 index 6dd1a4809ec1..000000000000 --- a/arch/riscv/include/asm/kvm_aia_aplic.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2021 Western Digital Corporation or its affiliates. - * Copyright (C) 2022 Ventana Micro Systems Inc. 
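Continuing the sketch above, loading the initial image and finalizing the launch might look like the following; the helper name, fd plumbing and page type are hypothetical, and the ``KVM_SET_MEMORY_ATTRIBUTES`` call reflects the requirement, noted in the LAUNCH_UPDATE documentation, that the range be marked private beforehand::

  #include <errno.h>
  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /*
   * Illustrative sketch: measure/encrypt one initial-state range and finalize
   * the launch. vm_fd/sev_fd as set up for KVM_SEV_SNP_LAUNCH_START; the
   * range must already be backed by a guest_memfd memslot.
   */
  static int snp_load_and_finish(int vm_fd, int sev_fd, uint64_t gpa,
                                 void *src, uint64_t size)
  {
          struct kvm_memory_attributes attrs = {
                  .address = gpa,
                  .size = size,
                  .attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
          };
          struct kvm_sev_snp_launch_update update = {
                  .gfn_start = gpa >> 12,
                  .uaddr = (uint64_t)(uintptr_t)src,
                  .len = size,
                  .type = KVM_SEV_SNP_PAGE_TYPE_NORMAL,
          };
          struct kvm_sev_snp_launch_finish finish = {};
          struct kvm_sev_cmd cmd = { .sev_fd = sev_fd };

          /* The range must be private before LAUNCH_UPDATE will accept it. */
          if (ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs))
                  return -errno;

          /* LAUNCH_UPDATE may make partial progress; retry until len is 0. */
          cmd.id = KVM_SEV_SNP_LAUNCH_UPDATE;
          cmd.data = (uint64_t)(uintptr_t)&update;
          while (update.len) {
                  if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) && errno != EAGAIN)
                          return -errno;
          }

          cmd.id = KVM_SEV_SNP_LAUNCH_FINISH;
          cmd.data = (uint64_t)(uintptr_t)&finish;
          return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) ? -errno : 0;
  }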
- */ -#ifndef __KVM_RISCV_AIA_IMSIC_H -#define __KVM_RISCV_AIA_IMSIC_H - -#include <linux/bitops.h> - -#define APLIC_MAX_IDC BIT(14) -#define APLIC_MAX_SOURCE 1024 - -#define APLIC_DOMAINCFG 0x0000 -#define APLIC_DOMAINCFG_RDONLY 0x80000000 -#define APLIC_DOMAINCFG_IE BIT(8) -#define APLIC_DOMAINCFG_DM BIT(2) -#define APLIC_DOMAINCFG_BE BIT(0) - -#define APLIC_SOURCECFG_BASE 0x0004 -#define APLIC_SOURCECFG_D BIT(10) -#define APLIC_SOURCECFG_CHILDIDX_MASK 0x000003ff -#define APLIC_SOURCECFG_SM_MASK 0x00000007 -#define APLIC_SOURCECFG_SM_INACTIVE 0x0 -#define APLIC_SOURCECFG_SM_DETACH 0x1 -#define APLIC_SOURCECFG_SM_EDGE_RISE 0x4 -#define APLIC_SOURCECFG_SM_EDGE_FALL 0x5 -#define APLIC_SOURCECFG_SM_LEVEL_HIGH 0x6 -#define APLIC_SOURCECFG_SM_LEVEL_LOW 0x7 - -#define APLIC_IRQBITS_PER_REG 32 - -#define APLIC_SETIP_BASE 0x1c00 -#define APLIC_SETIPNUM 0x1cdc - -#define APLIC_CLRIP_BASE 0x1d00 -#define APLIC_CLRIPNUM 0x1ddc - -#define APLIC_SETIE_BASE 0x1e00 -#define APLIC_SETIENUM 0x1edc - -#define APLIC_CLRIE_BASE 0x1f00 -#define APLIC_CLRIENUM 0x1fdc - -#define APLIC_SETIPNUM_LE 0x2000 -#define APLIC_SETIPNUM_BE 0x2004 - -#define APLIC_GENMSI 0x3000 - -#define APLIC_TARGET_BASE 0x3004 -#define APLIC_TARGET_HART_IDX_SHIFT 18 -#define APLIC_TARGET_HART_IDX_MASK 0x3fff -#define APLIC_TARGET_GUEST_IDX_SHIFT 12 -#define APLIC_TARGET_GUEST_IDX_MASK 0x3f -#define APLIC_TARGET_IPRIO_MASK 0xff -#define APLIC_TARGET_EIID_MASK 0x7ff - -#endif diff --git a/arch/riscv/include/asm/kvm_aia_imsic.h b/arch/riscv/include/asm/kvm_aia_imsic.h deleted file mode 100644 index da5881d2bde0..000000000000 --- a/arch/riscv/include/asm/kvm_aia_imsic.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2021 Western Digital Corporation or its affiliates. - * Copyright (C) 2022 Ventana Micro Systems Inc. 
- */ -#ifndef __KVM_RISCV_AIA_IMSIC_H -#define __KVM_RISCV_AIA_IMSIC_H - -#include <linux/types.h> -#include <asm/csr.h> - -#define IMSIC_MMIO_PAGE_SHIFT 12 -#define IMSIC_MMIO_PAGE_SZ (1UL << IMSIC_MMIO_PAGE_SHIFT) -#define IMSIC_MMIO_PAGE_LE 0x00 -#define IMSIC_MMIO_PAGE_BE 0x04 - -#define IMSIC_MIN_ID 63 -#define IMSIC_MAX_ID 2048 - -#define IMSIC_EIDELIVERY 0x70 - -#define IMSIC_EITHRESHOLD 0x72 - -#define IMSIC_EIP0 0x80 -#define IMSIC_EIP63 0xbf -#define IMSIC_EIPx_BITS 32 - -#define IMSIC_EIE0 0xc0 -#define IMSIC_EIE63 0xff -#define IMSIC_EIEx_BITS 32 - -#define IMSIC_FIRST IMSIC_EIDELIVERY -#define IMSIC_LAST IMSIC_EIE63 - -#define IMSIC_MMIO_SETIPNUM_LE 0x00 -#define IMSIC_MMIO_SETIPNUM_BE 0x04 - -#endif diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index 0f0a9d11bb5f..2967d305c442 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -10,12 +10,12 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/irq.h> +#include <linux/irqchip/riscv-imsic.h> #include <linux/irqdomain.h> #include <linux/kvm_host.h> #include <linux/percpu.h> #include <linux/spinlock.h> #include <asm/cpufeature.h> -#include <asm/kvm_aia_imsic.h> struct aia_hgei_control { raw_spinlock_t lock; @@ -394,6 +394,8 @@ int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner, { int ret = -ENOENT; unsigned long flags; + const struct imsic_global_config *gc; + const struct imsic_local_config *lc; struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu); if (!kvm_riscv_aia_available() || !hgctrl) @@ -409,11 +411,14 @@ int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner, raw_spin_unlock_irqrestore(&hgctrl->lock, flags); - /* TODO: To be updated later by AIA IMSIC HW guest file support */ - if (hgei_va) - *hgei_va = NULL; - if (hgei_pa) - *hgei_pa = 0; + gc = imsic_get_global_config(); + lc = (gc) ? 
per_cpu_ptr(gc->local, cpu) : NULL; + if (lc && ret > 0) { + if (hgei_va) + *hgei_va = lc->msi_va + (ret * IMSIC_MMIO_PAGE_SZ); + if (hgei_pa) + *hgei_pa = lc->msi_pa + (ret * IMSIC_MMIO_PAGE_SZ); + } return ret; } @@ -605,9 +610,11 @@ void kvm_riscv_aia_disable(void) int kvm_riscv_aia_init(void) { int rc; + const struct imsic_global_config *gc; if (!riscv_isa_extension_available(NULL, SxAIA)) return -ENODEV; + gc = imsic_get_global_config(); /* Figure-out number of bits in HGEIE */ csr_write(CSR_HGEIE, -1UL); @@ -619,17 +626,17 @@ int kvm_riscv_aia_init(void) /* * Number of usable HGEI lines should be minimum of per-HART * IMSIC guest files and number of bits in HGEIE - * - * TODO: To be updated later by AIA IMSIC HW guest file support */ - kvm_riscv_aia_nr_hgei = 0; + if (gc) + kvm_riscv_aia_nr_hgei = min((ulong)kvm_riscv_aia_nr_hgei, + BIT(gc->guest_index_bits) - 1); + else + kvm_riscv_aia_nr_hgei = 0; - /* - * Find number of guest MSI IDs - * - * TODO: To be updated later by AIA IMSIC HW guest file support - */ + /* Find number of guest MSI IDs */ kvm_riscv_aia_max_ids = IMSIC_MAX_ID; + if (gc && kvm_riscv_aia_nr_hgei) + kvm_riscv_aia_max_ids = gc->nr_guest_ids + 1; /* Initialize guest external interrupt line management */ rc = aia_hgei_init(); diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c index b467ba5ed910..da6ff1bade0d 100644 --- a/arch/riscv/kvm/aia_aplic.c +++ b/arch/riscv/kvm/aia_aplic.c @@ -7,12 +7,12 @@ * Anup Patel <apatel@ventanamicro.com> */ +#include <linux/irqchip/riscv-aplic.h> #include <linux/kvm_host.h> #include <linux/math.h> #include <linux/spinlock.h> #include <linux/swab.h> #include <kvm/iodev.h> -#include <asm/kvm_aia_aplic.h> struct aplic_irq { raw_spinlock_t lock; diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index 5cd407c6a8e4..39cd26af5a69 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -8,9 +8,9 @@ */ #include <linux/bits.h> +#include <linux/irqchip/riscv-imsic.h> #include <linux/kvm_host.h> #include <linux/uaccess.h> -#include <asm/kvm_aia_imsic.h> static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) { diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index e808723a85f1..0a1e859323b4 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -9,13 +9,13 @@ #include <linux/atomic.h> #include <linux/bitmap.h> +#include <linux/irqchip/riscv-imsic.h> #include <linux/kvm_host.h> #include <linux/math.h> #include <linux/spinlock.h> #include <linux/swab.h> #include <kvm/iodev.h> #include <asm/csr.h> -#include <asm/kvm_aia_imsic.h> #define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64)) diff --git a/arch/riscv/kvm/trace.h b/arch/riscv/kvm/trace.h new file mode 100644 index 000000000000..3d54175d805c --- /dev/null +++ b/arch/riscv/kvm/trace.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tracepoints for RISC-V KVM + * + * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd. 
+ * + */ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +TRACE_EVENT(kvm_entry, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.guest_context.sepc; + ), + + TP_printk("PC: 0x016%lx", __entry->pc) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(struct kvm_cpu_trap *trap), + TP_ARGS(trap), + + TP_STRUCT__entry( + __field(unsigned long, sepc) + __field(unsigned long, scause) + __field(unsigned long, stval) + __field(unsigned long, htval) + __field(unsigned long, htinst) + ), + + TP_fast_assign( + __entry->sepc = trap->sepc; + __entry->scause = trap->scause; + __entry->stval = trap->stval; + __entry->htval = trap->htval; + __entry->htinst = trap->htinst; + ), + + TP_printk("SEPC:0x%lx, SCAUSE:0x%lx, STVAL:0x%lx, HTVAL:0x%lx, HTINST:0x%lx", + __entry->sepc, + __entry->scause, + __entry->stval, + __entry->htval, + __entry->htinst) +); + +#endif /* _TRACE_RSICV_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 17e21df36cc1..bc15df3119c3 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -21,6 +21,9 @@ #include <asm/cacheflush.h> #include <asm/kvm_vcpu_vector.h> +#define CREATE_TRACE_POINTS +#include "trace.h" + const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), STATS_DESC_COUNTER(VCPU, ecall_exit_stat), @@ -831,6 +834,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ kvm_riscv_local_tlb_sanitize(vcpu); + trace_kvm_entry(vcpu); + guest_timing_enter_irqoff(); kvm_riscv_vcpu_enter_exit(vcpu); @@ -869,6 +874,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) local_irq_enable(); + trace_kvm_exit(&trap); + preempt_enable(); kvm_vcpu_srcu_read_lock(vcpu); diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index 5761f95abb60..fa98e5c024b2 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -185,6 +185,8 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case EXC_INST_ILLEGAL: case EXC_LOAD_MISALIGNED: case EXC_STORE_MISALIGNED: + case EXC_LOAD_ACCESS: + case EXC_STORE_ACCESS: if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) { kvm_riscv_vcpu_trap_redirect(vcpu, trap); ret = 1; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9281063636a7..d8bbcac89ac3 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -15,7 +15,6 @@ #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kvm_types.h> -#include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/seqlock.h> #include <linux/module.h> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 54b5b2565df8..4a74effe6870 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5749,6 +5749,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, { gpa_t size; + if (kvm_is_ucontrol(kvm)) + return -EINVAL; + /* When we are protected, we should not change the memory slots */ if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index c9ecae830634..54deafd0d698 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1304,10 
+1304,24 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (rc == -EAGAIN) rc = 0; - if (rc || scb_s->icptcode || signal_pending(current) || + + /* + * Exit the loop if the guest needs to process the intercept + */ + if (rc || scb_s->icptcode) + break; + + /* + * Exit the loop if the host needs to process an intercept, + * but rewind the PSW to re-enter SIE once that's completed + * instead of passing a "no action" intercept to the guest. + */ + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || - kvm_s390_vcpu_sie_inhibited(vcpu)) + kvm_s390_vcpu_sie_inhibited(vcpu)) { + kvm_s390_rewind_psw(vcpu, 4); break; + } cond_resched(); } @@ -1426,8 +1440,10 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || - kvm_s390_vcpu_sie_inhibited(vcpu)) + kvm_s390_vcpu_sie_inhibited(vcpu)) { + kvm_s390_rewind_psw(vcpu, 4); return 0; + } vsie_page = get_vsie_page(vcpu->kvm, scb_addr); if (IS_ERR(vsie_page)) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 5187fcf4b610..566d19b02483 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -139,6 +139,9 @@ KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) +KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) +KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level) +KVM_X86_OP_OPTIONAL(gmem_invalidate) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8ca74e7678f..9bb2e164c523 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -121,6 +121,7 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -1812,6 +1813,9 @@ struct kvm_x86_ops { gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); + int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); + void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); + int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn); }; struct kvm_x86_nested_ops { @@ -1939,6 +1943,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 5a8246dd532f..8647cc05e2f4 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -59,6 +59,14 @@ #define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 #define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) +/* Preferred GHCB GPA Request */ +#define GHCB_MSR_PREF_GPA_REQ 0x010 +#define GHCB_MSR_GPA_VALUE_POS 12 +#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0) + +#define GHCB_MSR_PREF_GPA_RESP 0x011 +#define 
GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff + /* GHCB GPA Register */ #define GHCB_MSR_REG_GPA_REQ 0x012 #define GHCB_MSR_REG_GPA_REQ_VAL(v) \ @@ -93,11 +101,17 @@ enum psc_op { /* GHCBData[11:0] */ \ GHCB_MSR_PSC_REQ) +#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12) +#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52) + #define GHCB_MSR_PSC_RESP 0x015 #define GHCB_MSR_PSC_RESP_VAL(val) \ /* GHCBData[63:32] */ \ (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) +/* Set highest bit as a generic error response */ +#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP) + /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 @@ -115,8 +129,19 @@ enum psc_op { * The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which * is a local stack variable in set_pages_state(). Do not increase this value * without evaluating the impact to stack usage. + * + * Use VMGEXIT_PSC_MAX_COUNT in cases where the actual GHCB-defined max value + * is needed, such as when processing GHCB requests on the hypervisor side. */ #define VMGEXIT_PSC_MAX_ENTRY 64 +#define VMGEXIT_PSC_MAX_COUNT 253 + +#define VMGEXIT_PSC_ERROR_GENERIC (0x100UL << 32) +#define VMGEXIT_PSC_ERROR_INVALID_HDR ((1UL << 32) | 1) +#define VMGEXIT_PSC_ERROR_INVALID_ENTRY ((1UL << 32) | 2) + +#define VMGEXIT_PSC_OP_PRIVATE 1 +#define VMGEXIT_PSC_OP_SHARED 2 struct psc_hdr { u16 cur_entry; diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ca20cc4e5826..1936f37e3371 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -91,6 +91,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); /* RMUPDATE detected 4K page and 2MB page overlap. */ #define RMPUPDATE_FAIL_OVERLAP 4 +/* PSMASH failed due to concurrent access by another CPU */ +#define PSMASH_FAIL_INUSE 3 + /* RMP page size */ #define RMP_PG_SIZE_4K 0 #define RMP_PG_SIZE_2M 1 diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 728c98175b9c..f0dea3750ca9 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -285,7 +285,14 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) -#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) +#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) +#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) +#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) + +#define SVM_SEV_FEAT_INT_INJ_MODES \ + (SVM_SEV_FEAT_RESTRICTED_INJECTION | \ + SVM_SEV_FEAT_ALTERNATE_INJECTION) struct vmcb_seg { u16 selector; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 9fae1b73b529..988b5204d636 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -697,6 +697,11 @@ enum sev_cmd_id { /* Second time is the charm; improved versions of the above ioctls. */ KVM_SEV_INIT2, + /* SNP-specific commands */ + KVM_SEV_SNP_LAUNCH_START = 100, + KVM_SEV_SNP_LAUNCH_UPDATE, + KVM_SEV_SNP_LAUNCH_FINISH, + KVM_SEV_NR_MAX, }; @@ -824,6 +829,48 @@ struct kvm_sev_receive_update_data { __u32 pad2; }; +struct kvm_sev_snp_launch_start { + __u64 policy; + __u8 gosvw[16]; + __u16 flags; + __u8 pad0[6]; + __u64 pad1[4]; +}; + +/* Kept in sync with firmware values for simplicity. 
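The GHCB_MSR_PSC_REQ_TO_GFN()/GHCB_MSR_PSC_REQ_TO_OP() additions above encode the MSR-protocol Page State Change layout (operation in GHCBData[55:52], GFN in GHCBData[51:12], error code in GHCBData[63:32] of the response). A standalone sketch of the same bit manipulation, with masks restated from the GHCB spec rather than taken from the header, is shown here for reference::

  #include <stdint.h>

  /* Assumed operation values, mirroring VMGEXIT_PSC_OP_PRIVATE/SHARED. */
  #define PSC_OP_PRIVATE  1
  #define PSC_OP_SHARED   2

  /* GFN lives in GHCBData[51:12]. */
  static inline uint64_t psc_req_gfn(uint64_t msr)
  {
          return (msr >> 12) & ((1ULL << 40) - 1);
  }

  /* Operation lives in GHCBData[55:52]. */
  static inline unsigned int psc_req_op(uint64_t msr)
  {
          return (msr >> 52) & 0xf;
  }

  /* Response: error code in GHCBData[63:32], PSC response code 0x015 below. */
  static inline uint64_t psc_resp(uint64_t error)
  {
          return (error << 32) | 0x015;
  }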
*/ +#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 +#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 +#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 +#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 +#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6 + +struct kvm_sev_snp_launch_update { + __u64 gfn_start; + __u64 uaddr; + __u64 len; + __u8 type; + __u8 pad0; + __u16 flags; + __u32 pad1; + __u64 pad2[4]; +}; + +#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 +#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 +#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 + +struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; + __u8 pad0[3]; + __u16 flags; + __u64 pad1[4]; +}; + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) @@ -874,5 +921,6 @@ struct kvm_hyperv_eventfd { #define KVM_X86_SW_PROTECTED_VM 1 #define KVM_X86_SEV_VM 2 #define KVM_X86_SEV_ES_VM 3 +#define KVM_X86_SNP_VM 4 #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fec95a770270..4287a8071a3a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -44,6 +44,7 @@ config KVM select KVM_VFIO select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING + select KVM_GENERIC_PRE_FAULT_MEMORY select KVM_WERROR if WERROR help Support hosting fully virtualized guest machines using hardware @@ -139,6 +140,9 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM + select KVM_GENERIC_PRIVATE_MEM + select HAVE_KVM_GMEM_PREPARE + select HAVE_KVM_GMEM_INVALIDATE help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2e454316f2a2..dc80e72e4848 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -253,8 +253,6 @@ static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm) return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm)); } -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); - int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8d74bdef68c1..4e0e9963066f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3308,7 +3308,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, return RET_PF_CONTINUE; } -static bool page_fault_can_be_fast(struct kvm_page_fault *fault) +static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault) { /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only @@ -3320,6 +3320,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) return false; /* + * For hardware-protected VMs, certain conditions like attempting to + * perform a write to a page which is not in the state that the guest + * expects it to be in can result in a nested/extended #PF. In this + * case, the below code might misconstrue this situation as being the + * result of a write-protected access, and treat it as a spurious case + * rather than taking any action to satisfy the real source of the #PF + * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the + * guest spinning on a #PF indefinitely, so don't attempt the fast path + * in this case. 
+ * + * Note that the kvm_mem_is_private() check might race with an + * attribute update, but this will either result in the guest spinning + * on RET_PF_SPURIOUS until the update completes, or an actual spurious + * case might go down the slow path. Either case will resolve itself. + */ + if (kvm->arch.has_private_mem && + fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) + return false; + + /* * #PF can be fast if: * * 1. The shadow page table entry is not present and A/D bits are @@ -3419,7 +3439,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) u64 *sptep; uint retry_count = 0; - if (!page_fault_can_be_fast(fault)) + if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3428,7 +3448,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) u64 new_spte; if (tdp_mmu_enabled) - sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); @@ -3438,7 +3458,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * available as the vCPU holds a reference to its root(s). */ if (WARN_ON_ONCE(!sptep)) - spte = REMOVED_SPTE; + spte = FROZEN_SPTE; if (!is_shadow_present_pte(spte)) break; @@ -4271,7 +4291,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL); + r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, + true, NULL, NULL); + + /* + * Account fixed page faults, otherwise they'll never be counted, but + * ignore stats for all other return times. Page-ready "faults" aren't + * truly spurious and never trigger emulation + */ + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; } static inline u8 kvm_max_level_for_order(int order) @@ -4291,6 +4320,25 @@ static inline u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } +static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + u8 max_level, int gmem_order) +{ + u8 req_max_level; + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + max_level = min(kvm_max_level_for_order(gmem_order), max_level); + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + req_max_level = static_call(kvm_x86_private_max_mapping_level)(kvm, pfn); + if (req_max_level) + max_level = min(max_level, req_max_level); + + return req_max_level; +} + static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { @@ -4308,9 +4356,9 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, return r; } - fault->max_level = min(kvm_max_level_for_order(max_order), - fault->max_level); fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); + fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, + fault->max_level, max_order); return RET_PF_CONTINUE; } @@ -4661,6 +4709,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } +static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, + u8 *level) +{ + int r; + + /* + * Restrict to TDP page fault, since that's the only case where the MMU + * is indexed by GPA. 
+ */ + if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault) + return -EOPNOTSUPP; + + do { + if (signal_pending(current)) + return -EINTR; + cond_resched(); + r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); + } while (r == RET_PF_RETRY); + + if (r < 0) + return r; + + switch (r) { + case RET_PF_FIXED: + case RET_PF_SPURIOUS: + return 0; + + case RET_PF_EMULATE: + return -ENOENT; + + case RET_PF_RETRY: + case RET_PF_CONTINUE: + case RET_PF_INVALID: + default: + WARN_ONCE(1, "could not fix page fault during prefault"); + return -EIO; + } +} + +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK; + u8 level = PG_LEVEL_4K; + u64 end; + int r; + + /* + * reload is efficient when called repeatedly, so we can do it on + * every iteration. + */ + kvm_mmu_reload(vcpu); + + if (kvm_arch_has_private_mem(vcpu->kvm) && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + + /* + * Shadow paging uses GVA for kvm page fault, so restrict to + * two-dimensional paging. + */ + r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); + if (r < 0) + return r; + + /* + * If the mapping that covers range->gpa can use a huge page, it + * may start below it or end after range->gpa + range->size. + */ + end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); + return min(range->size, end - range->gpa); +} + static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; @@ -5886,14 +6007,24 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err } if (r == RET_PF_INVALID) { + vcpu->stat.pf_taken++; + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, - &emulation_type); + &emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } if (r < 0) return r; + + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; + else if (r == RET_PF_EMULATE) + vcpu->stat.pf_emulate++; + else if (r == RET_PF_SPURIOUS) + vcpu->stat.pf_spurious++; + if (r != RET_PF_EMULATE) return 1; @@ -6787,6 +6918,7 @@ restart: return need_tlb_flush; } +EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ce2fcd19ba6b..1721d97743e9 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -288,7 +288,8 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, } static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u64 err, bool prefetch, int *emulation_type) + u64 err, bool prefetch, + int *emulation_type, u8 *level) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -318,14 +319,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); } - /* - * Async #PF "faults", a.k.a. prefetch faults, are not faults from the - * guest perspective and have already been counted at the time of the - * original fault. 
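A short worked example of the rounding at the end of kvm_arch_vcpu_pre_fault_memory() above, using hypothetical values: when a 2 MiB mapping satisfies the fault, progress is reported up to the end of that huge page rather than a single 4 KiB page::

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t gpa = 0x40201000, size = 0x800000;     /* request 8 MiB */
          uint64_t hpage = 2ULL << 20;                    /* 2 MiB mapping */
          uint64_t end = (gpa & ~(hpage - 1)) + hpage;    /* 0x40400000 */
          uint64_t done = end - gpa < size ? end - gpa : size;

          /* Prints 0x1ff000: up to the end of the covering huge page. */
          printf("processed %#llx bytes\n", (unsigned long long)done);
          return 0;
  }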
- */ - if (!prefetch) - vcpu->stat.pf_taken++; - if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) r = kvm_tdp_page_fault(vcpu, &fault); else @@ -344,20 +337,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; + if (level) + *level = fault.goal_level; - /* - * Similar to above, prefetch faults aren't truly spurious, and the - * async #PF path doesn't do emulation. Do count faults that are fixed - * by the async #PF handler though, otherwise they'll never be counted. - */ - if (r == RET_PF_FIXED) - vcpu->stat.pf_fixed++; - else if (prefetch) - ; - else if (r == RET_PF_EMULATE) - vcpu->stat.pf_emulate++; - else if (r == RET_PF_SPURIOUS) - vcpu->stat.pf_spurious++; return r; } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index a5e014d7bc62..59cac37615b6 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -383,7 +383,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || - WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) + WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value)) mmio_value = 0; if (!mmio_value) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 52fa004a1fbc..ef793c459b05 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -202,7 +202,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; /* * If a thread running without exclusive control of the MMU lock must perform a - * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a + * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * @@ -212,14 +212,14 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * * Only used by the TDP MMU. */ -#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) +#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) /* Removed SPTEs must not be misconstrued as shadow present PTEs. */ -static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); +static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); -static inline bool is_removed_spte(u64 spte) +static inline bool is_frozen_spte(u64 spte) { - return spte == REMOVED_SPTE; + return spte == FROZEN_SPTE; } /* Get an SPTE's index into its parent's page table (and the spt array). */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 36539c1b36cd..ff27e1eadd54 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -365,8 +365,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * value to the removed SPTE value. */ for (;;) { - old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); - if (!is_removed_spte(old_spte)) + old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE); + if (!is_frozen_spte(old_spte)) break; cpu_relax(); } @@ -397,11 +397,11 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * No retry is needed in the atomic update path as the * sole concern is dropping a Dirty bit, i.e. no other * task can zap/remove the SPTE as mmu_lock is held for - * write. Marking the SPTE as a removed SPTE is not + * write. 
Marking the SPTE as a frozen SPTE is not * strictly necessary for the same reason, but using - * the remove SPTE value keeps the shared/exclusive + * the frozen SPTE value keeps the shared/exclusive * paths consistent and allows the handle_changed_spte() - * call below to hardcode the new value to REMOVED_SPTE. + * call below to hardcode the new value to FROZEN_SPTE. * * Note, even though dropping a Dirty bit is the only * scenario where a non-atomic update could result in a @@ -413,10 +413,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * it here. */ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, - REMOVED_SPTE, level); + FROZEN_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_spte, REMOVED_SPTE, level, shared); + old_spte, FROZEN_SPTE, level, shared); } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -490,19 +490,19 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (!was_present && !is_present) { /* - * If this change does not involve a MMIO SPTE or removed SPTE, + * If this change does not involve a MMIO SPTE or frozen SPTE, * it is unexpected. Log the change, though it should not * impact the guest since both the former and current SPTEs * are nonpresent. */ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) && !is_mmio_spte(kvm, new_spte) && - !is_removed_spte(new_spte))) + !is_frozen_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" "are MMIO SPTEs, or the new SPTE is\n" - "a temporary removed SPTE.\n" + "a temporary frozen SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; @@ -530,7 +530,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } -static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) +static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, + u64 new_spte) { u64 *sptep = rcu_dereference(iter->sptep); @@ -540,7 +541,7 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) * and pre-checking before inserting a new SPTE is advantageous as it * avoids unnecessary work. */ - WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); + WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte)); /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and @@ -572,9 +573,9 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) * no side-effects other than setting iter->old_spte to the last * known value of the spte. */ -static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { int ret; @@ -590,8 +591,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, return 0; } -static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) +static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) { int ret; @@ -603,26 +604,26 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * in its place before the TLBs are flushed. * * Delay processing of the zapped SPTE until after TLBs are flushed and - * the REMOVED_SPTE is replaced (see below). + * the FROZEN_SPTE is replaced (see below). 
*/ - ret = __tdp_mmu_set_spte_atomic(iter, REMOVED_SPTE); + ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE); if (ret) return ret; kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); /* - * No other thread can overwrite the removed SPTE as they must either + * No other thread can overwrite the frozen SPTE as they must either * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special removed SPTE value. Use the raw write helper to + * overwrite the special frozen SPTE value. Use the raw write helper to * avoid an unnecessary check on volatile bits. */ __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE); /* * Process the zapped SPTE after flushing TLBs, and after replacing - * REMOVED_SPTE with 0. This minimizes the amount of time vCPUs are - * blocked by the REMOVED_SPTE and reduces contention on the child + * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are + * blocked by the FROZEN_SPTE and reduces contention on the child * SPTEs. */ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, @@ -652,12 +653,12 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, /* * No thread should be using this function to set SPTEs to or from the - * temporary removed SPTE value. + * temporary frozen SPTE value. * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic * should be used. If operating under the MMU lock in write mode, the - * use of the removed SPTE should not be necessary. + * use of the frozen SPTE should not be necessary. */ - WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte)); + WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte)); old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); @@ -1126,7 +1127,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * If SPTE has been frozen by another thread, just give up and * retry, avoiding unnecessary page table allocation and free. */ - if (is_removed_spte(iter.old_spte)) + if (is_frozen_spte(iter.old_spte)) goto retry; if (iter.level == fault->goal_level) @@ -1801,12 +1802,11 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, * * WARNING: This function is only intended to be called during fast_page_fault. 
*/ -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte) { struct tdp_iter iter; struct kvm_mmu *mmu = vcpu->arch.mmu; - gfn_t gfn = addr >> PAGE_SHIFT; tdp_ptep_t sptep = NULL; tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 58b55e61bd33..1b74e058a81c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -64,7 +64,7 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void) int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 95095a233a45..df8818759698 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -25,6 +25,7 @@ #include <asm/fpu/xcr.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> +#include <asm/sev.h> #include "mmu.h" #include "x86.h" @@ -37,7 +38,7 @@ #define GHCB_VERSION_DEFAULT 2ULL #define GHCB_VERSION_MIN 1ULL -#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP +#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) /* enable/disable SEV support */ static bool sev_enabled = true; @@ -47,6 +48,10 @@ module_param_named(sev, sev_enabled, bool, 0444); static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); +/* enable/disable SEV-SNP support */ +static bool sev_snp_enabled = true; +module_param_named(sev_snp, sev_snp_enabled, bool, 0444); + /* enable/disable SEV-ES DebugSwap support */ static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); @@ -56,6 +61,23 @@ static u64 sev_supported_vmsa_features; #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + +#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) + +#define INITIAL_VMSA_GPA 0xFFFFFFFFF000 + static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); @@ -66,6 +88,8 @@ static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; +static int snp_decommission_context(struct kvm *kvm); + struct enc_region { struct list_head list; unsigned long npages; @@ -92,12 +116,17 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) down_write(&sev_deactivate_lock); wbinvd_on_all_cpus(); - ret = sev_guest_df_flush(&error); + + if (sev_snp_enabled) + ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error); + else + ret = sev_guest_df_flush(&error); up_write(&sev_deactivate_lock); if (ret) - pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n", + sev_snp_enabled ? 
"-SNP" : "", ret, error); return ret; } @@ -233,6 +262,53 @@ static void sev_decommission(unsigned int handle) sev_guest_decommission(&decommission, NULL); } +/* + * Transition a page to hypervisor-owned/shared state in the RMP table. This + * should not fail under normal conditions, but leak the page should that + * happen since it will no longer be usable by the host due to RMP protections. + */ +static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) +{ + if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { + snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); + return -EIO; + } + + return 0; +} + +/* + * Certain page-states, such as Pre-Guest and Firmware pages (as documented + * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be + * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE + * unless they are reclaimed first. + * + * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they + * might not be usable by the host due to being set as immutable or still + * being associated with a guest ASID. + * + * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be + * converted back to shared, as the page is no longer usable due to RMP + * protections, and it's infeasible for the guest to continue on. + */ +static int snp_page_reclaim(struct kvm *kvm, u64 pfn) +{ + struct sev_data_snp_page_reclaim data = {0}; + int fw_err, rc; + + data.paddr = __sme_set(pfn << PAGE_SHIFT); + rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); + if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { + snp_leak_pages(pfn, 1); + return -EIO; + } + + if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) + return -EIO; + + return rc; +} + static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) { struct sev_data_deactivate deactivate; @@ -288,6 +364,9 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (sev->es_active && !sev->ghcb_version) sev->ghcb_version = GHCB_VERSION_DEFAULT; + if (vm_type == KVM_X86_SNP_VM) + sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; + ret = sev_asid_new(sev); if (ret) goto e_no_asid; @@ -348,7 +427,8 @@ static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; if (kvm->arch.vm_type != KVM_X86_SEV_VM && - kvm->arch.vm_type != KVM_X86_SEV_ES_VM) + kvm->arch.vm_type != KVM_X86_SEV_ES_VM && + kvm->arch.vm_type != KVM_X86_SNP_VM) return -EINVAL; if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) @@ -1999,6 +2079,410 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val) } } +/* + * The guest context contains all the information, keys and metadata + * associated with the guest that the firmware tracks to implement SEV + * and SNP features. The firmware stores the guest context in hypervisor + * provide page via the SNP_GCTX_CREATE command. 
+ */ +static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct sev_data_snp_addr data = {}; + void *context; + int rc; + + /* Allocate memory for context page */ + context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); + if (!context) + return NULL; + + data.address = __psp_pa(context); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); + if (rc) { + pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", + rc, argp->error); + snp_free_firmware_page(context); + return NULL; + } + + return context; +} + +static int snp_bind_asid(struct kvm *kvm, int *error) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_activate data = {0}; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.asid = sev_get_asid(kvm); + return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); +} + +static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_launch_start start = {0}; + struct kvm_sev_snp_launch_start params; + int rc; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + /* Don't allow userspace to allocate memory for more than 1 SNP context. */ + if (sev->snp_context) + return -EINVAL; + + sev->snp_context = snp_context_create(kvm, argp); + if (!sev->snp_context) + return -ENOTTY; + + if (params.flags) + return -EINVAL; + + if (params.policy & ~SNP_POLICY_MASK_VALID) + return -EINVAL; + + /* Check for policy bits that must be set */ + if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) || + !(params.policy & SNP_POLICY_MASK_SMT)) + return -EINVAL; + + if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) + return -EINVAL; + + start.gctx_paddr = __psp_pa(sev->snp_context); + start.policy = params.policy; + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); + if (rc) { + pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", + __func__, rc); + goto e_free_context; + } + + sev->fd = argp->sev_fd; + rc = snp_bind_asid(kvm, &argp->error); + if (rc) { + pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", + __func__, rc); + goto e_free_context; + } + + return 0; + +e_free_context: + snp_decommission_context(kvm); + + return rc; +} + +struct sev_gmem_populate_args { + __u8 type; + int sev_fd; + int fw_error; +}; + +static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn, + void __user *src, int order, void *opaque) +{ + struct sev_gmem_populate_args *sev_populate_args = opaque; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + int n_private = 0, ret, i; + int npages = (1 << order); + gfn_t gfn; + + if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) + return -EINVAL; + + for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { + struct sev_data_snp_launch_update fw_args = {0}; + bool assigned; + int level; + + if (!kvm_mem_is_private(kvm, gfn)) { + pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n", + __func__, gfn); + ret = -EINVAL; + goto err; + } + + ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); + if (ret || assigned) { + pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", + __func__, gfn, ret, assigned); + ret = -EINVAL; + goto err; + } + + 
if (src) { + void *vaddr = kmap_local_pfn(pfn + i); + + ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE); + if (ret) + goto err; + kunmap_local(vaddr); + } + + ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K, + sev_get_asid(kvm), true); + if (ret) + goto err; + + n_private++; + + fw_args.gctx_paddr = __psp_pa(sev->snp_context); + fw_args.address = __sme_set(pfn_to_hpa(pfn + i)); + fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); + fw_args.page_type = sev_populate_args->type; + + ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &fw_args, &sev_populate_args->fw_error); + if (ret) + goto fw_err; + } + + return 0; + +fw_err: + /* + * If the firmware command failed handle the reclaim and cleanup of that + * PFN specially vs. prior pages which can be cleaned up below without + * needing to reclaim in advance. + * + * Additionally, when invalid CPUID function entries are detected, + * firmware writes the expected values into the page and leaves it + * unencrypted so it can be used for debugging and error-reporting. + * + * Copy this page back into the source buffer so userspace can use this + * information to provide information on which CPUID leaves/fields + * failed CPUID validation. + */ + if (!snp_page_reclaim(kvm, pfn + i) && + sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && + sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { + void *vaddr = kmap_local_pfn(pfn + i); + + if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE)) + pr_debug("Failed to write CPUID page back to userspace\n"); + + kunmap_local(vaddr); + } + + /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */ + n_private--; + +err: + pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", + __func__, ret, sev_populate_args->fw_error, n_private); + for (i = 0; i < n_private; i++) + kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K); + + return ret; +} + +static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_gmem_populate_args sev_populate_args = {0}; + struct kvm_sev_snp_launch_update params; + struct kvm_memory_slot *memslot; + long npages, count; + void __user *src; + int ret = 0; + + if (!sev_snp_guest(kvm) || !sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, + params.gfn_start, params.len, params.type, params.flags); + + if (!PAGE_ALIGNED(params.len) || params.flags || + (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && + params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && + params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && + params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && + params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) + return -EINVAL; + + npages = params.len / PAGE_SIZE; + + /* + * For each GFN that's being prepared as part of the initial guest + * state, the following pre-conditions are verified: + * + * 1) The backing memslot is a valid private memslot. + * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES + * beforehand. + * 3) The PFN of the guest_memfd has not already been set to private + * in the RMP table. + * + * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page + * faults if there's a race between a fault and an attribute update via + * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized + * here. 
However, kvm->slots_lock guards against both this as well as + * concurrent memslot updates occurring while these checks are being + * performed, so use that here to make it easier to reason about the + * initial expected state and better guard against unexpected + * situations. + */ + mutex_lock(&kvm->slots_lock); + + memslot = gfn_to_memslot(kvm, params.gfn_start); + if (!kvm_slot_can_be_private(memslot)) { + ret = -EINVAL; + goto out; + } + + sev_populate_args.sev_fd = argp->sev_fd; + sev_populate_args.type = params.type; + src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr); + + count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, + sev_gmem_post_populate, &sev_populate_args); + if (count < 0) { + argp->error = sev_populate_args.fw_error; + pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", + __func__, count, argp->error); + ret = -EIO; + } else { + params.gfn_start += count; + params.len -= count * PAGE_SIZE; + if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) + params.uaddr += count * PAGE_SIZE; + + ret = 0; + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) + ret = -EFAULT; + } + +out: + mutex_unlock(&kvm->slots_lock); + + return ret; +} + +static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_launch_update data = {}; + struct kvm_vcpu *vcpu; + unsigned long i; + int ret; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.page_type = SNP_PAGE_TYPE_VMSA; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vcpu_svm *svm = to_svm(vcpu); + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* Transition the VMSA page to a firmware state. */ + ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); + if (ret) + return ret; + + /* Issue the SNP command to encrypt the VMSA */ + data.address = __sme_pa(svm->sev_es.vmsa); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &data, &argp->error); + if (ret) { + snp_page_reclaim(kvm, pfn); + + return ret; + } + + svm->vcpu.arch.guest_state_protected = true; + /* + * SEV-ES (and thus SNP) guest mandates LBR Virtualization to + * be _always_ ON. Enable it only after setting + * guest_state_protected because KVM_SET_MSRS allows dynamic + * toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); + } + + return 0; +} + +static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_snp_launch_finish params; + struct sev_data_snp_launch_finish *data; + void *id_block = NULL, *id_auth = NULL; + int ret; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (!sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + if (params.flags) + return -EINVAL; + + /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. 
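Because snp_launch_update() writes the advanced ``gfn_start``/``uaddr``/``len`` values back to userspace on partial progress, the VMM is expected to re-issue the command until ``len`` reaches zero. A sketch of that loop over the usual KVM_MEMORY_ENCRYPT_OP plumbing; ``vm_fd``/``sev_fd``, the GPA and the source buffer are placeholders, and 4KiB guest pages are assumed::

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Load and measure one source buffer into guest memory. */
    static int snp_launch_update_range(int vm_fd, int sev_fd, uint64_t gpa,
                                       void *buf, uint64_t size, uint8_t type)
    {
            struct kvm_sev_snp_launch_update update = {
                    .gfn_start = gpa >> 12,
                    .uaddr     = (uintptr_t)buf,
                    .len       = size,
                    .type      = type,
            };
            struct kvm_sev_cmd cmd = {
                    .id     = KVM_SEV_SNP_LAUNCH_UPDATE,
                    .data   = (uintptr_t)&update,
                    .sev_fd = (uint32_t)sev_fd,
            };
            int ret;

            do {
                    ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
            } while ((!ret && update.len) ||                   /* partial progress */
                     (ret && (errno == EAGAIN || errno == EINTR)));

            if (ret)
                    fprintf(stderr, "SNP_LAUNCH_UPDATE: rc %d fw_error %u\n",
                            ret, cmd.error);
            return ret;
    }
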
*/ + ret = snp_launch_update_vmsa(kvm, argp); + if (ret) + return ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + if (params.id_block_en) { + id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); + if (IS_ERR(id_block)) { + ret = PTR_ERR(id_block); + goto e_free; + } + + data->id_block_en = 1; + data->id_block_paddr = __sme_pa(id_block); + + id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); + if (IS_ERR(id_auth)) { + ret = PTR_ERR(id_auth); + goto e_free_id_block; + } + + data->id_auth_paddr = __sme_pa(id_auth); + + if (params.auth_key_en) + data->auth_key_en = 1; + } + + data->vcek_disabled = params.vcek_disabled; + + memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); + data->gctx_paddr = __psp_pa(sev->snp_context); + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); + + kfree(id_auth); + +e_free_id_block: + kfree(id_block); + +e_free: + kfree(data); + + return ret; +} + int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -2022,6 +2506,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) goto out; } + /* + * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only + * allow the use of SNP-specific commands. + */ + if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) { + r = -EPERM; + goto out; + } + switch (sev_cmd.id) { case KVM_SEV_ES_INIT: if (!sev_es_enabled) { @@ -2086,6 +2579,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_RECEIVE_FINISH: r = sev_receive_finish(kvm, &sev_cmd); break; + case KVM_SEV_SNP_LAUNCH_START: + r = snp_launch_start(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_UPDATE: + r = snp_launch_update(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_FINISH: + r = snp_launch_finish(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -2281,6 +2783,31 @@ e_source_fput: return ret; } +static int snp_decommission_context(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_addr data = {}; + int ret; + + /* If context is not created then do nothing */ + if (!sev->snp_context) + return 0; + + /* Do the decommision, which will unbind the ASID from the SNP context */ + data.address = __sme_pa(sev->snp_context); + down_write(&sev_deactivate_lock); + ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); + up_write(&sev_deactivate_lock); + + if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) + return ret; + + snp_free_firmware_page(sev->snp_context); + sev->snp_context = NULL; + + return 0; +} + void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -2322,7 +2849,17 @@ void sev_vm_destroy(struct kvm *kvm) } } - sev_unbind_asid(kvm, sev->handle); + if (sev_snp_guest(kvm)) { + /* + * Decomission handles unbinding of the ASID. If it fails for + * some unexpected reason, just leak the ASID. 
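For completeness, the bracketing commands dispatched above look roughly like this from the VMM side, together with the update loop sketched earlier. The VM is assumed to already be an SNP VM initialized with KVM_SEV_INIT2; the policy bit positions used here (bit 16 = SMT allowed, bit 17 = reserved, must be one) follow the SEV-SNP firmware ABI and, like ``vm_fd``/``sev_fd``, are assumptions of this sketch::

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int snp_launch_start_finish(int vm_fd, int sev_fd)
    {
            struct kvm_sev_snp_launch_start start = {
                    .policy = (1ULL << 17) | (1ULL << 16),  /* reserved-MBO | SMT */
            };
            struct kvm_sev_snp_launch_finish finish = {};
            struct kvm_sev_cmd cmd = {
                    .id     = KVM_SEV_SNP_LAUNCH_START,
                    .data   = (uintptr_t)&start,
                    .sev_fd = (uint32_t)sev_fd,
            };

            if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
                    return -1;

            /* ...mark ranges private and run KVM_SEV_SNP_LAUNCH_UPDATE here... */

            cmd.id   = KVM_SEV_SNP_LAUNCH_FINISH;
            cmd.data = (uintptr_t)&finish;
            return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
    }
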
+ */ + if (snp_decommission_context(kvm)) + return; + } else { + sev_unbind_asid(kvm, sev->handle); + } + sev_asid_free(sev); } @@ -2336,11 +2873,16 @@ void __init sev_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SEV_ES); kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); } + if (sev_snp_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); + } } void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -2427,6 +2969,7 @@ void __init sev_hardware_setup(void) sev_es_asid_count = min_sev_asid - 1; WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; + sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: if (boot_cpu_has(X86_FEATURE_SEV)) @@ -2439,9 +2982,15 @@ out: pr_info("SEV-ES %s (ASIDs %u - %u)\n", sev_es_supported ? "enabled" : "disabled", min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + if (boot_cpu_has(X86_FEATURE_SEV_SNP)) + pr_info("SEV-SNP %s (ASIDs %u - %u)\n", + sev_snp_supported ? "enabled" : "disabled", + min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; + sev_snp_enabled = sev_snp_supported; + if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) sev_es_debug_swap_enabled = false; @@ -2520,7 +3069,13 @@ do_wbinvd: void sev_guest_memory_reclaimed(struct kvm *kvm) { - if (!sev_guest(kvm)) + /* + * With SNP+gmem, private/encrypted memory is unreachable via the + * hva-based mmu notifiers, so these events are only actually + * pertaining to shared pages where there is no need to perform + * the WBINVD to flush associated caches. + */ + if (!sev_guest(kvm) || sev_snp_guest(kvm)) return; wbinvd_on_all_cpus(); @@ -2535,11 +3090,24 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); + /* + * If it's an SNP guest, then the VMSA was marked in the RMP table as + * a guest-owned page. Transition the page to hypervisor state before + * releasing it back to the system. 
+ */ + if (sev_snp_guest(vcpu->kvm)) { + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) + goto skip_vmsa_free; + } + if (vcpu->arch.guest_state_protected) sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); __free_page(virt_to_page(svm->sev_es.vmsa)); +skip_vmsa_free: if (svm->sev_es.ghcb_sa_free) kvfree(svm->sev_es.ghcb_sa); } @@ -2735,6 +3303,13 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; + case SVM_VMGEXIT_AP_CREATION: + if (!sev_snp_guest(vcpu->kvm)) + goto vmgexit_err; + if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) + if (!kvm_ghcb_rax_is_valid(svm)) + goto vmgexit_err; + break; case SVM_VMGEXIT_NMI_COMPLETE: case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: @@ -2742,6 +3317,10 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_HV_FEATURES: case SVM_VMGEXIT_TERM_REQUEST: break; + case SVM_VMGEXIT_PSC: + if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm)) + goto vmgexit_err; + break; default: reason = GHCB_ERR_INVALID_EVENT; goto vmgexit_err; @@ -2929,6 +3508,437 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_rmptable_psmash(kvm_pfn_t pfn) +{ + int ret; + + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); + + /* + * PSMASH_FAIL_INUSE indicates another processor is modifying the + * entry, so retry until that's no longer the case. + */ + do { + ret = psmash(pfn); + } while (ret == PSMASH_FAIL_INUSE); + + return ret; +} + +static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (vcpu->run->hypercall.ret) + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + else + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); + + return 1; /* resume guest */ +} + +static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) +{ + u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); + u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = gpa; + vcpu->run->hypercall.args[1] = 1; + vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + + vcpu->arch.complete_userspace_io = snp_complete_psc_msr; + + return 0; /* forward request to userspace */ +} + +struct psc_buffer { + struct psc_hdr hdr; + struct psc_entry entries[]; +} __packed; + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc); + +static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) +{ + svm->sev_es.psc_inflight = 0; + svm->sev_es.psc_idx = 0; + svm->sev_es.psc_2m = false; + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret); +} + +static void __snp_complete_one_psc(struct vcpu_svm *svm) +{ + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + struct psc_entry *entries = psc->entries; + struct psc_hdr *hdr = &psc->hdr; + __u16 idx; + + /* + * Everything in-flight has been processed successfully. 
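The ``psc_hdr``/``psc_entry`` types consumed here describe the guest-written Page State Change buffer; their definitions are not part of this diff, so the sketch below is an assumed layout matching the GHCB specification's PSC structure (one 8-byte entry: bits 11:0 current page, 51:12 GFN, 55:52 operation, bit 56 page size), included only to make the indexing and the ``cur_page``/``cur_entry`` updates easier to follow::

    /* Assumed layout, kernel-style types, per the GHCB spec's PSC structure. */
    struct psc_hdr {
            u16 cur_entry;          /* first entry still to be processed */
            u16 end_entry;          /* last valid entry */
            u32 reserved;
    } __packed;

    struct psc_entry {
            u64 cur_page   : 12,    /* 4K sub-pages already processed */
                gfn        : 40,
                operation  : 4,     /* VMGEXIT_PSC_OP_PRIVATE/SHARED/... */
                pagesize   : 1,     /* 0 = 4K, 1 = 2M */
                reserved   : 7;
    } __packed;
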
Update the + * corresponding entries in the guest's PSC buffer and zero out the + * count of in-flight PSC entries. + */ + for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; + svm->sev_es.psc_inflight--, idx++) { + struct psc_entry *entry = &entries[idx]; + + entry->cur_page = entry->pagesize ? 512 : 1; + } + + hdr->cur_entry = idx; +} + +static int snp_complete_one_psc(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + + if (vcpu->run->hypercall.ret) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; /* resume guest */ + } + + __snp_complete_one_psc(svm); + + /* Handle the next range (if any). */ + return snp_begin_psc(svm, psc); +} + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) +{ + struct psc_entry *entries = psc->entries; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct psc_hdr *hdr = &psc->hdr; + struct psc_entry entry_start; + u16 idx, idx_start, idx_end; + int npages; + bool huge; + u64 gfn; + + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + +next_range: + /* There should be no other PSCs in-flight at this point. */ + if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * The PSC descriptor buffer can be modified by a misbehaved guest after + * validation, so take care to only use validated copies of values used + * for things like array indexing. + */ + idx_start = hdr->cur_entry; + idx_end = hdr->end_entry; + + if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); + return 1; + } + + /* Find the start of the next range which needs processing. */ + for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { + entry_start = entries[idx]; + + gfn = entry_start.gfn; + huge = entry_start.pagesize; + npages = huge ? 512 : 1; + + if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); + return 1; + } + + if (entry_start.cur_page) { + /* + * If this is a partially-completed 2M range, force 4K handling + * for the remaining pages since they're effectively split at + * this point. Subsequent code should ensure this doesn't get + * combined with adjacent PSC entries where 2M handling is still + * possible. + */ + npages -= entry_start.cur_page; + gfn += entry_start.cur_page; + huge = false; + } + + if (npages) + break; + } + + if (idx > idx_end) { + /* Nothing more to process. */ + snp_complete_psc(svm, 0); + return 1; + } + + svm->sev_es.psc_2m = huge; + svm->sev_es.psc_idx = idx; + svm->sev_es.psc_inflight = 1; + + /* + * Find all subsequent PSC entries that contain adjacent GPA + * ranges/operations and can be combined into a single + * KVM_HC_MAP_GPA_RANGE exit. + */ + while (++idx <= idx_end) { + struct psc_entry entry = entries[idx]; + + if (entry.operation != entry_start.operation || + entry.gfn != entry_start.gfn + npages || + entry.cur_page || !!entry.pagesize != huge) + break; + + svm->sev_es.psc_inflight++; + npages += huge ? 
512 : 1; + } + + switch (entry_start.operation) { + case VMGEXIT_PSC_OP_PRIVATE: + case VMGEXIT_PSC_OP_SHARED: + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); + vcpu->run->hypercall.args[1] = npages; + vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= entry_start.pagesize + ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M + : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + vcpu->arch.complete_userspace_io = snp_complete_one_psc; + return 0; /* forward request to userspace */ + default: + /* + * Only shared/private PSC operations are currently supported, so if the + * entire range consists of unsupported operations (e.g. SMASH/UNSMASH), + * then consider the entire range completed and avoid exiting to + * userspace. In theory snp_complete_psc() can always be called directly + * at this point to complete the current range and start the next one, + * but that could lead to unexpected levels of recursion. + */ + __snp_complete_one_psc(svm); + goto next_range; + } + + unreachable(); +} + +static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex)); + + /* Mark the vCPU as offline and not runnable */ + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + + /* Clear use of the VMSA */ + svm->vmcb->control.vmsa_pa = INVALID_PAGE; + + if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { + gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + struct kvm_memory_slot *slot; + kvm_pfn_t pfn; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) + return -EINVAL; + + /* + * The new VMSA will be private memory guest memory, so + * retrieve the PFN from the gmem backend. + */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL)) + return -EINVAL; + + /* + * From this point forward, the VMSA will always be a + * guest-mapped page rather than the initial one allocated + * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa + * could be free'd and cleaned up here, but that involves + * cleanups like wbinvd_on_all_cpus() which would ideally + * be handled during teardown rather than guest boot. + * Deferring that also allows the existing logic for SEV-ES + * VMSAs to be re-used with minimal SNP-specific changes. + */ + svm->sev_es.snp_has_guest_vmsa = true; + + /* Use the new VMSA */ + svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); + + /* Mark the vCPU as runnable */ + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + + /* + * gmem pages aren't currently migratable, but if this ever + * changes then care should be taken to ensure + * svm->sev_es.vmsa is pinned through some other means. + */ + kvm_release_pfn_clean(pfn); + } + + /* + * When replacing the VMSA during SEV-SNP AP creation, + * mark the VMCB dirty so that full state is always reloaded. + */ + vmcb_mark_all_dirty(svm->vmcb); + + return 0; +} + +/* + * Invoked as part of svm_vcpu_reset() processing of an init event. 
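Both PSC paths above leave the vCPU in a KVM_EXIT_HYPERCALL/KVM_HC_MAP_GPA_RANGE exit and later inspect ``run->hypercall.ret`` in the completion callbacks. A sketch of the corresponding VMM handler, assuming the exit was enabled at VM setup with KVM_ENABLE_CAP(KVM_CAP_EXIT_HYPERCALL, 1ULL << KVM_HC_MAP_GPA_RANGE); ``vm_fd`` and ``run`` are placeholders, and ``args[1]`` is a 4KiB-page count as passed by the code above::

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>
    #include <linux/kvm_para.h>

    static void handle_map_gpa_range(int vm_fd, struct kvm_run *run)
    {
            struct kvm_memory_attributes attr = {
                    .address    = run->hypercall.args[0],
                    .size       = run->hypercall.args[1] * 4096,
                    .attributes = (run->hypercall.args[2] & KVM_MAP_GPA_RANGE_ENCRYPTED) ?
                                  KVM_MEMORY_ATTRIBUTE_PRIVATE : 0,
            };

            /* Non-zero ret is what the snp_complete_*psc* paths treat as failure. */
            run->hypercall.ret = ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attr) ? 1 : 0;
            /* Caller re-enters the vCPU with KVM_RUN afterwards. */
    }
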
+ */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + if (!sev_snp_guest(vcpu->kvm)) + return; + + mutex_lock(&svm->sev_es.snp_vmsa_mutex); + + if (!svm->sev_es.snp_ap_waiting_for_reset) + goto unlock; + + svm->sev_es.snp_ap_waiting_for_reset = false; + + ret = __sev_snp_update_protected_guest_state(vcpu); + if (ret) + vcpu_unimpl(vcpu, "snp: AP state update on init failed\n"); + +unlock: + mutex_unlock(&svm->sev_es.snp_vmsa_mutex); +} + +static int sev_snp_ap_creation(struct vcpu_svm *svm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_vcpu *target_vcpu; + struct vcpu_svm *target_svm; + unsigned int request; + unsigned int apic_id; + bool kick; + int ret; + + request = lower_32_bits(svm->vmcb->control.exit_info_1); + apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); + + /* Validate the APIC ID */ + target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); + if (!target_vcpu) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", + apic_id); + return -EINVAL; + } + + ret = 0; + + target_svm = to_svm(target_vcpu); + + /* + * The target vCPU is valid, so the vCPU will be kicked unless the + * request is for CREATE_ON_INIT. For any errors at this stage, the + * kick will place the vCPU in an non-runnable state. + */ + kick = true; + + mutex_lock(&target_svm->sev_es.snp_vmsa_mutex); + + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + target_svm->sev_es.snp_ap_waiting_for_reset = true; + + /* Interrupt injection mode shouldn't change for AP creation */ + if (request < SVM_VMGEXIT_AP_DESTROY) { + u64 sev_features; + + sev_features = vcpu->arch.regs[VCPU_REGS_RAX]; + sev_features ^= sev->vmsa_features; + + if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX]); + ret = -EINVAL; + goto out; + } + } + + switch (request) { + case SVM_VMGEXIT_AP_CREATE_ON_INIT: + kick = false; + fallthrough; + case SVM_VMGEXIT_AP_CREATE: + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", + svm->vmcb->control.exit_info_2); + ret = -EINVAL; + goto out; + } + + /* + * Malicious guest can RMPADJUST a large page into VMSA which + * will hit the SNP erratum where the CPU will incorrectly signal + * an RMP violation #PF if a hugepage collides with the RMP entry + * of VMSA page, reject the AP CREATE request if VMSA address from + * guest is 2M aligned. 
+ */ + if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { + vcpu_unimpl(vcpu, + "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", + svm->vmcb->control.exit_info_2); + ret = -EINVAL; + goto out; + } + + target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; + break; + case SVM_VMGEXIT_AP_DESTROY: + break; + default: + vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", + request); + ret = -EINVAL; + break; + } + +out: + if (kick) { + kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); + kvm_vcpu_kick(target_vcpu); + } + + mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex); + + return ret; +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3008,6 +4018,38 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); break; + case GHCB_MSR_PREF_GPA_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_REG_GPA_REQ: { + u64 gfn; + + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + + svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); + + set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + } + case GHCB_MSR_PSC_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + ret = snp_begin_psc_msr(svm, control->ghcb_gpa); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -3020,12 +4062,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; - vcpu->run->system_event.ndata = 1; - vcpu->run->system_event.data[0] = control->ghcb_gpa; - - return 0; + goto out_terminate; } default: /* Error, keep GHCB MSR value as-is */ @@ -3036,6 +4073,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) control->ghcb_gpa, ret); return ret; + +out_terminate: + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + + return 0; } int sev_handle_vmgexit(struct kvm_vcpu *vcpu) @@ -3071,6 +4116,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); sev_es_sync_from_ghcb(svm); + + /* SEV-SNP guest requires that the GHCB GPA must be registered */ + if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { + vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); + return -EINVAL; + } + ret = sev_es_validate_vmgexit(svm); if (ret) return ret; @@ -3145,6 +4197,22 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) vcpu->run->system_event.ndata = 1; vcpu->run->system_event.data[0] = control->ghcb_gpa; break; + case SVM_VMGEXIT_PSC: + ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); + if (ret) + break; + + ret = snp_begin_psc(svm, 
svm->sev_es.ghcb_sa); + break; + case SVM_VMGEXIT_AP_CREATION: + ret = sev_snp_ap_creation(svm); + if (ret) { + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + } + + ret = 1; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", @@ -3238,7 +4306,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) * the VMSA will be NULL if this vCPU is the destination for intrahost * migration, and will be copied later. */ - if (svm->sev_es.vmsa) + if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ @@ -3310,6 +4378,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); + + mutex_init(&svm->sev_es.snp_vmsa_mutex); } void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) @@ -3420,3 +4490,271 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) return p; } + +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) +{ + struct kvm_memory_slot *slot; + struct kvm *kvm = vcpu->kvm; + int order, rmp_level, ret; + bool assigned; + kvm_pfn_t pfn; + gfn_t gfn; + + gfn = gpa >> PAGE_SHIFT; + + /* + * The only time RMP faults occur for shared pages is when the guest is + * triggering an RMP fault for an implicit page-state change from + * shared->private. Implicit page-state changes are forwarded to + * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults + * for shared pages should not end up here. + */ + if (!kvm_mem_is_private(kvm, gfn)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n", + gpa); + return; + } + + slot = gfn_to_memslot(kvm, gfn); + if (!kvm_slot_can_be_private(slot)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", + gpa); + return; + } + + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order); + if (ret) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", + gpa); + return; + } + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret || !assigned) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", + gpa, pfn, ret); + goto out_no_trace; + } + + /* + * There are 2 cases where a PSMASH may be needed to resolve an #NPF + * with PFERR_GUEST_RMP_BIT set: + * + * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM + * bit set if the guest issues them with a smaller granularity than + * what is indicated by the page-size bit in the 2MB RMP entry for + * the PFN that backs the GPA. + * + * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is + * smaller than what is indicated by the 2MB RMP entry for the PFN + * that backs the GPA. + * + * In both these cases, the corresponding 2M RMP entry needs to + * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already + * split into 4K RMP entries, then this is likely a spurious case which + * can occur when there are concurrent accesses by the guest to a 2MB + * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in + * the process of being PMASH'd into 4K entries. These cases should + * resolve automatically on subsequent accesses, so just ignore them + * here. 
+ */ + if (rmp_level == PG_LEVEL_4K) + goto out; + + ret = snp_rmptable_psmash(pfn); + if (ret) { + /* + * Look it up again. If it's 4K now then the PSMASH may have + * raced with another process and the issue has already resolved + * itself. + */ + if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) && + assigned && rmp_level == PG_LEVEL_4K) + goto out; + + pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", + gpa, pfn, ret); + } + + kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); +out: + trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); +out_no_trace: + put_page(pfn_to_page(pfn)); +} + +static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn = start; + + while (pfn < end) { + int ret, rmp_level; + bool assigned; + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret) { + pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", + pfn, start, end, rmp_level, ret); + return false; + } + + if (assigned) { + pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", + __func__, pfn, start, end, rmp_level); + return false; + } + + pfn++; + } + + return true; +} + +static u8 max_level_for_order(int order) +{ + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) +{ + kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + + /* + * If this is a large folio, and the entire 2M range containing the + * PFN is currently shared, then the entire 2M-aligned range can be + * set to private via a single 2M RMP entry. + */ + if (max_level_for_order(order) > PG_LEVEL_4K && + is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) + return true; + + return false; +} + +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + kvm_pfn_t pfn_aligned; + gfn_t gfn_aligned; + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc) { + pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", + gfn, pfn, rc); + return -ENOENT; + } + + if (assigned) { + pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", + __func__, gfn, pfn, max_order, level); + return 0; + } + + if (is_large_rmp_possible(kvm, pfn, max_order)) { + level = PG_LEVEL_2M; + pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); + } else { + level = PG_LEVEL_4K; + pfn_aligned = pfn; + gfn_aligned = gfn; + } + + rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); + if (rc) { + pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", + gfn, pfn, level, rc); + return -EINVAL; + } + + pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", + __func__, gfn, pfn, pfn_aligned, max_order, level); + + return 0; +} + +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return; + + pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); + + for (pfn = start; pfn < end;) { + bool use_2m_update = false; + int rc, rmp_level; + bool assigned; + + rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (rc || 
!assigned) + goto next_pfn; + + use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && + end >= (pfn + PTRS_PER_PMD) && + rmp_level > PG_LEVEL_4K; + + /* + * If an unaligned PFN corresponds to a 2M region assigned as a + * large page in the RMP table, PSMASH the region into individual + * 4K RMP entries before attempting to convert a 4K sub-page. + */ + if (!use_2m_update && rmp_level > PG_LEVEL_4K) { + /* + * This shouldn't fail, but if it does, report it, but + * still try to update RMP entry to shared and pray this + * was a spurious error that can be addressed later. + */ + rc = snp_rmptable_psmash(pfn); + WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", + pfn, rc); + } + + rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); + if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", + pfn, rc)) + goto next_pfn; + + /* + * SEV-ES avoids host/guest cache coherency issues through + * WBINVD hooks issued via MMU notifiers during run-time, and + * KVM's VM destroy path at shutdown. Those MMU notifier events + * don't cover gmem since there is no requirement to map pages + * to a HVA in order to use them for a running guest. While the + * shutdown path would still likely cover things for SNP guests, + * userspace may also free gmem pages during run-time via + * hole-punching operations on the guest_memfd, so flush the + * cache entries for these pages before free'ing them back to + * the host. + */ + clflush_cache_range(__va(pfn_to_hpa(pfn)), + use_2m_update ? PMD_SIZE : PAGE_SIZE); +next_pfn: + pfn += use_2m_update ? PTRS_PER_PMD : 1; + cond_resched(); + } +} + +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc || !assigned) + return PG_LEVEL_4K; + + return level; +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c95d3900fe56..c58da281f14f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1404,6 +1404,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; + if (init_event) + sev_snp_init_protected_guest_state(vcpu); + init_vmcb(vcpu); if (!init_event) @@ -2050,6 +2053,7 @@ static int pf_interception(struct kvm_vcpu *vcpu) static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + int rc; u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; @@ -2063,11 +2067,19 @@ static int npf_interception(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) error_code &= ~PFERR_SYNTHETIC_MASK; + if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) + error_code |= PFERR_PRIVATE_ACCESS; + trace_kvm_page_fault(vcpu, fault_address, error_code); - return kvm_mmu_page_fault(vcpu, fault_address, error_code, - static_cpu_has(X86_FEATURE_DECODEASSISTS) ? - svm->vmcb->control.insn_bytes : NULL, - svm->vmcb->control.insn_len); + rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 
+ svm->vmcb->control.insn_bytes : NULL, + svm->vmcb->control.insn_len); + + if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) + sev_handle_rmp_fault(vcpu, fault_address, error_code); + + return rc; } static int db_interception(struct kvm_vcpu *vcpu) @@ -4937,8 +4949,11 @@ static int svm_vm_init(struct kvm *kvm) if (type != KVM_X86_DEFAULT_VM && type != KVM_X86_SW_PROTECTED_VM) { - kvm->arch.has_protected_state = (type == KVM_X86_SEV_ES_VM); + kvm->arch.has_protected_state = + (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); to_kvm_sev_info(kvm)->need_init = true; + + kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); } if (!pause_filter_count || !pause_filter_thresh) @@ -5095,6 +5110,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, .alloc_apic_backing_page = svm_alloc_apic_backing_page, + + .gmem_prepare = sev_gmem_prepare, + .gmem_invalidate = sev_gmem_invalidate, + .private_max_mapping_level = sev_private_max_mapping_level, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 0f1472690b59..d2397b98bbf0 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -94,6 +94,7 @@ struct kvm_sev_info { struct list_head mirror_entry; /* Use as a list entry of mirrors */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; + void *snp_context; /* SNP guest context page */ }; struct kvm_svm { @@ -209,6 +210,18 @@ struct vcpu_sev_es_state { u32 ghcb_sa_len; bool ghcb_sa_sync; bool ghcb_sa_free; + + /* SNP Page-State-Change buffer entries currently being processed */ + u16 psc_idx; + u16 psc_inflight; + bool psc_2m; + + u64 ghcb_registered_gpa; + + struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. 
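svm_vm_init() above only flips has_private_mem/has_protected_state for the new VM type; selecting SNP happens at creation time in userspace. A sketch of that path, assuming KVM_X86_SNP_VM is advertised through KVM_CAP_VM_TYPES and that a zero-filled ``struct kvm_sev_init`` is acceptable for KVM_SEV_INIT2 (both assumptions of this example, as are the file descriptor details)::

    #include <fcntl.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int create_snp_vm(void)
    {
            struct kvm_sev_init init = {};
            struct kvm_sev_cmd cmd = {
                    .id   = KVM_SEV_INIT2,
                    .data = (uintptr_t)&init,
            };
            int kvm_fd, vm_fd, sev_fd;

            kvm_fd = open("/dev/kvm", O_RDWR);
            if (kvm_fd < 0)
                    return -1;
            if (!(ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES) &
                  (1 << KVM_X86_SNP_VM)))
                    return -1;      /* SNP VM type not supported */

            vm_fd  = ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_SNP_VM);
            sev_fd = open("/dev/sev", O_RDWR);
            if (vm_fd < 0 || sev_fd < 0)
                    return -1;

            cmd.sev_fd = (uint32_t)sev_fd;
            return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) ? -1 : vm_fd;
    }
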
*/ + gpa_t snp_vmsa_gpa; + bool snp_ap_waiting_for_reset; + bool snp_has_guest_vmsa; }; struct vcpu_svm { @@ -350,6 +363,23 @@ static __always_inline bool sev_es_guest(struct kvm *kvm) #endif } +static __always_inline bool sev_snp_guest(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_AMD_SEV + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) && + !WARN_ON_ONCE(!sev_es_guest(kvm)); +#else + return false; +#endif +} + +static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) +{ + return svm->sev_es.ghcb_registered_gpa == val; +} + static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -705,6 +735,11 @@ void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); int sev_dev_get_attr(u32 group, u64 attr, u64 *val); extern unsigned int max_sev_asid; +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); #else static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) { return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); @@ -718,6 +753,18 @@ static inline void sev_hardware_unsetup(void) {} static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } #define max_sev_asid 0 +static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} +static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} +static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + return 0; +} +static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} +static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + return 0; +} + #endif /* vmenter.S */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e19fed438a67..10ad5d32fcc3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1834,6 +1834,37 @@ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) ); +/* + * Tracepoint for #NPFs due to RMP faults. 
+ */ +TRACE_EVENT(kvm_rmp_fault, + TP_PROTO(struct kvm_vcpu *vcpu, u64 gpa, u64 pfn, u64 error_code, + int rmp_level, int psmash_ret), + TP_ARGS(vcpu, gpa, pfn, error_code, rmp_level, psmash_ret), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, gpa) + __field(u64, pfn) + __field(u64, error_code) + __field(int, rmp_level) + __field(int, psmash_ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gpa = gpa; + __entry->pfn = pfn; + __entry->error_code = error_code; + __entry->rmp_level = rmp_level; + __entry->psmash_ret = psmash_ret; + ), + + TP_printk("vcpu %u gpa %016llx pfn 0x%llx error_code 0x%llx rmp_level %d psmash_ret %d", + __entry->vcpu_id, __entry->gpa, __entry->pfn, + __entry->error_code, __entry->rmp_level, __entry->psmash_ret) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0763a0f72a06..a6968eadd418 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4705,6 +4705,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MEMORY_FAULT_INFO: r = 1; break; + case KVM_CAP_PRE_FAULT_MEMORY: + r = tdp_enabled; + break; case KVM_CAP_EXIT_HYPERCALL: r = KVM_EXIT_HYPERCALL_VALID_MASK; break; @@ -10929,6 +10932,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) { + kvm_vcpu_reset(vcpu, true); + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) { + r = 1; + goto out; + } + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || @@ -13136,6 +13147,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_PMI, vcpu)) return true; + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) + return true; + if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || kvm_guest_apic_has_interrupt(vcpu))) @@ -13589,6 +13603,24 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +bool kvm_arch_gmem_prepare_needed(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_SNP_VM; +} + +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) +{ + return static_call(kvm_x86_gmem_prepare)(kvm, pfn, gfn, max_order); +} +#endif + +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + static_call_cond(kvm_x86_gmem_invalidate)(start, end); +} +#endif int kvm_spec_ctrl_test_value(u64 value) { @@ -13974,6 +14006,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault); static int __init kvm_x86_init(void) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 692c01e41a18..c3c922bf077f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2441,4 +2441,45 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, } #endif /* CONFIG_KVM_PRIVATE_MEM */ +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); +bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); +#endif + +/** + * kvm_gmem_populate() - Populate/prepare a GPA range with guest data + * + * @kvm: 
KVM instance + * @gfn: starting GFN to be populated + * @src: userspace-provided buffer containing data to copy into GFN range + * (passed to @post_populate, and incremented on each iteration + * if not NULL) + * @npages: number of pages to copy from userspace-buffer + * @post_populate: callback to issue for each gmem page that backs the GPA + * range + * @opaque: opaque data to pass to @post_populate callback + * + * This is primarily intended for cases where a gmem-backed GPA range needs + * to be initialized with userspace-provided data prior to being mapped into + * the guest as a private page. This should be called with the slots->lock + * held so that caller-enforced invariants regarding the expected memory + * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES. + * + * Returns the number of pages that were populated. + */ +typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, int order, void *opaque); + +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, + kvm_gmem_populate_cb post_populate, void *opaque); + +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +#endif + +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range); +#endif + #endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 59f1df0cde5a..9e2c3fe2ecbe 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -208,7 +208,8 @@ enum mapping_flags { AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ AS_STABLE_WRITES, /* must wait for writeback before modifying folio contents */ - AS_UNMOVABLE, /* The mapping cannot be moved, ever */ + AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping, + including to move the mapping */ }; /** @@ -309,20 +310,20 @@ static inline void mapping_clear_stable_writes(struct address_space *mapping) clear_bit(AS_STABLE_WRITES, &mapping->flags); } -static inline void mapping_set_unmovable(struct address_space *mapping) +static inline void mapping_set_inaccessible(struct address_space *mapping) { /* - * It's expected unmovable mappings are also unevictable. Compaction + * It's expected inaccessible mappings are also unevictable. Compaction * migrate scanner (isolate_migratepages_block()) relies on this to * reduce page locking. 
*/ set_bit(AS_UNEVICTABLE, &mapping->flags); - set_bit(AS_UNMOVABLE, &mapping->flags); + set_bit(AS_INACCESSIBLE, &mapping->flags); } -static inline bool mapping_unmovable(struct address_space *mapping) +static inline bool mapping_inaccessible(struct address_space *mapping) { - return test_bit(AS_UNMOVABLE, &mapping->flags); + return test_bit(AS_INACCESSIBLE, &mapping->flags); } static inline gfp_t mapping_gfp_mask(struct address_space * mapping) diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 3705c2044fc0..903ddfea8585 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -658,6 +658,7 @@ struct sev_data_snp_launch_update { * @id_auth_paddr: system physical address of ID block authentication structure * @id_block_en: indicates whether ID block is present * @auth_key_en: indicates whether author key is present in authentication structure + * @vcek_disabled: indicates whether use of VCEK is allowed for attestation reports * @rsvd: reserved * @host_data: host-supplied data for guest, not interpreted by firmware */ @@ -667,7 +668,8 @@ struct sev_data_snp_launch_finish { u64 id_auth_paddr; u8 id_block_en:1; u8 auth_key_en:1; - u64 rsvd:62; + u8 vcek_disabled:1; + u64 rsvd:61; u8 host_data[32]; } __packed; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d03842abae57..e5af8c692dc0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -917,6 +917,7 @@ struct kvm_enable_cap { #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_PRE_FAULT_MEMORY 236 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd { __u64 reserved[6]; }; +#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) + +struct kvm_pre_fault_memory { + __u64 gpa; + __u64 size; + __u64 flags; + __u64 padding[5]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/mm/compaction.c b/mm/compaction.c index 739b1bf3d637..6cb901b63482 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1179,22 +1179,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) || (mapping && is_unevictable)) { bool migrate_dirty = true; - bool is_unmovable; + bool is_inaccessible; /* * Only folios without mappings or that have * a ->migrate_folio callback are possible to migrate * without blocking. * - * Folios from unmovable mappings are not migratable. + * Folios from inaccessible mappings are not migratable. * * However, we can be racing with truncation, which can * free the mapping that we need to check. Truncation * holds the folio lock until after the folio is removed * from the page so holding it ourselves is sufficient. * - * To avoid locking the folio just to check unmovable, - * assume every unmovable folio is also unevictable, + * To avoid locking the folio just to check inaccessible, + * assume every inaccessible folio is also unevictable, * which is a cheaper test. If our assumption goes * wrong, it's not a correctness bug, just potentially * wasted cycles. 
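The new vCPU ioctl added to the uapi header above follows the same "retry until the range is drained" pattern as the other range-based interfaces in this series; the selftest further down exercises it with assertions, so this is just a bare sketch with ``vcpu_fd``, ``gpa`` and ``size`` as placeholders::

    #include <errno.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int pre_fault(int vcpu_fd, uint64_t gpa, uint64_t size)
    {
            struct kvm_pre_fault_memory range = {
                    .gpa  = gpa,
                    .size = size,
            };
            int ret;

            do {
                    ret = ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range);
            } while (range.size && (!ret || errno == EINTR));

            return ret;
    }
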
@@ -1207,9 +1207,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, migrate_dirty = !mapping || mapping->a_ops->migrate_folio; } - is_unmovable = mapping && mapping_unmovable(mapping); + is_inaccessible = mapping && mapping_inaccessible(mapping); folio_unlock(folio); - if (!migrate_dirty || is_unmovable) + if (!migrate_dirty || is_inaccessible) goto isolate_fail_put; } diff --git a/mm/migrate.c b/mm/migrate.c index 20cb9f5f7446..3fe4a9854f7a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -965,7 +965,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, if (!mapping) rc = migrate_folio(mapping, dst, src, mode); - else if (mapping_unmovable(mapping)) + else if (mapping_inaccessible(mapping)) rc = -EOPNOTSUPP; else if (mapping->a_ops->migrate_folio) /* diff --git a/mm/truncate.c b/mm/truncate.c index e99085bf3d34..581977d2356f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -233,7 +233,8 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) * doing a complex calculation here, and then doing the zeroing * anyway if the page split fails. */ - folio_zero_range(folio, offset, length); + if (!mapping_inaccessible(folio->mapping)) + folio_zero_range(folio, offset, length); if (folio_has_private(folio)) folio_invalidate(folio, offset, length); diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index d03842abae57..e5af8c692dc0 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -917,6 +917,7 @@ struct kvm_enable_cap { #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_PRE_FAULT_MEMORY 236 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd { __u64 reserved[6]; }; +#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) + +struct kvm_pre_fault_memory { + __u64 gpa; + __u64 size; + __u64 flags; + __u64 padding[5]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/tools/perf/arch/riscv/Makefile b/tools/perf/arch/riscv/Makefile index a8d25d005207..90c3c476a242 100644 --- a/tools/perf/arch/riscv/Makefile +++ b/tools/perf/arch/riscv/Makefile @@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1 endif PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 PERF_HAVE_JITDUMP := 1 +HAVE_KVM_STAT_SUPPORT := 1 diff --git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build index 603dbb5ae4dc..d72b04f8d32b 100644 --- a/tools/perf/arch/riscv/util/Build +++ b/tools/perf/arch/riscv/util/Build @@ -1,5 +1,6 @@ perf-y += perf_regs.o perf-y += header.o +perf-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/riscv/util/kvm-stat.c b/tools/perf/arch/riscv/util/kvm-stat.c new file mode 100644 index 000000000000..491aef449d1a --- /dev/null +++ b/tools/perf/arch/riscv/util/kvm-stat.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Arch specific functions for perf kvm stat. + * + * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd. 
+ * + */ +#include <errno.h> +#include <memory.h> +#include "../../../util/evsel.h" +#include "../../../util/kvm-stat.h" +#include "riscv_exception_types.h" +#include "debug.h" + +define_exit_reasons_table(riscv_exit_reasons, kvm_riscv_exception_class); + +const char *vcpu_id_str = "id"; +const char *kvm_exit_reason = "scause"; +const char *kvm_entry_trace = "kvm:kvm_entry"; +const char *kvm_exit_trace = "kvm:kvm_exit"; + +const char *kvm_events_tp[] = { + "kvm:kvm_entry", + "kvm:kvm_exit", + NULL, +}; + +static void event_get_key(struct evsel *evsel, + struct perf_sample *sample, + struct event_key *key) +{ + key->info = 0; + key->key = evsel__intval(evsel, sample, kvm_exit_reason); + key->exit_reasons = riscv_exit_reasons; +} + +static bool event_begin(struct evsel *evsel, + struct perf_sample *sample __maybe_unused, + struct event_key *key __maybe_unused) +{ + return evsel__name_is(evsel, kvm_entry_trace); +} + +static bool event_end(struct evsel *evsel, + struct perf_sample *sample, + struct event_key *key) +{ + if (evsel__name_is(evsel, kvm_exit_trace)) { + event_get_key(evsel, sample, key); + return true; + } + return false; +} + +static struct kvm_events_ops exit_events = { + .is_begin_event = event_begin, + .is_end_event = event_end, + .decode_key = exit_event_decode_key, + .name = "VM-EXIT" +}; + +struct kvm_reg_events_ops kvm_reg_events_ops[] = { + { + .name = "vmexit", + .ops = &exit_events, + }, + { NULL, NULL }, +}; + +const char * const kvm_skip_events[] = { + NULL, +}; + +int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) +{ + kvm->exit_reasons_isa = "riscv64"; + return 0; +} diff --git a/tools/perf/arch/riscv/util/riscv_exception_types.h b/tools/perf/arch/riscv/util/riscv_exception_types.h new file mode 100644 index 000000000000..c49b8fa5e847 --- /dev/null +++ b/tools/perf/arch/riscv/util/riscv_exception_types.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef ARCH_PERF_RISCV_EXCEPTION_TYPES_H +#define ARCH_PERF_RISCV_EXCEPTION_TYPES_H + +#define EXC_INST_MISALIGNED 0 +#define EXC_INST_ACCESS 1 +#define EXC_INST_ILLEGAL 2 +#define EXC_BREAKPOINT 3 +#define EXC_LOAD_MISALIGNED 4 +#define EXC_LOAD_ACCESS 5 +#define EXC_STORE_MISALIGNED 6 +#define EXC_STORE_ACCESS 7 +#define EXC_SYSCALL 8 +#define EXC_HYPERVISOR_SYSCALL 9 +#define EXC_SUPERVISOR_SYSCALL 10 +#define EXC_INST_PAGE_FAULT 12 +#define EXC_LOAD_PAGE_FAULT 13 +#define EXC_STORE_PAGE_FAULT 15 +#define EXC_INST_GUEST_PAGE_FAULT 20 +#define EXC_LOAD_GUEST_PAGE_FAULT 21 +#define EXC_VIRTUAL_INST_FAULT 22 +#define EXC_STORE_GUEST_PAGE_FAULT 23 + +#define EXC(x) {EXC_##x, #x } + +#define kvm_riscv_exception_class \ + EXC(INST_MISALIGNED), EXC(INST_ACCESS), EXC(INST_ILLEGAL), \ + EXC(BREAKPOINT), EXC(LOAD_MISALIGNED), EXC(LOAD_ACCESS), \ + EXC(STORE_MISALIGNED), EXC(STORE_ACCESS), EXC(SYSCALL), \ + EXC(HYPERVISOR_SYSCALL), EXC(SUPERVISOR_SYSCALL), \ + EXC(INST_PAGE_FAULT), EXC(LOAD_PAGE_FAULT), EXC(STORE_PAGE_FAULT), \ + EXC(INST_GUEST_PAGE_FAULT), EXC(LOAD_GUEST_PAGE_FAULT), \ + EXC(VIRTUAL_INST_FAULT), EXC(STORE_GUEST_PAGE_FAULT) + +#endif /* ARCH_PERF_RISCV_EXCEPTION_TYPES_H */ diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ac280dcba996..b073b818be37 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -145,6 +145,7 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test TEST_GEN_PROGS_x86_64 += 
system_counter_offset_test +TEST_GEN_PROGS_x86_64 += pre_fault_memory_test # Compiled outputs used by test targets TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c new file mode 100644 index 000000000000..0350a8896a2f --- /dev/null +++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024, Intel, Inc + * + * Author: + * Isaku Yamahata <isaku.yamahata at gmail.com> + */ +#include <linux/sizes.h> + +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +/* Arbitrarily chosen values */ +#define TEST_SIZE (SZ_2M + PAGE_SIZE) +#define TEST_NPAGES (TEST_SIZE / PAGE_SIZE) +#define TEST_SLOT 10 + +static void guest_code(uint64_t base_gpa) +{ + volatile uint64_t val __used; + int i; + + for (i = 0; i < TEST_NPAGES; i++) { + uint64_t *src = (uint64_t *)(base_gpa + i * PAGE_SIZE); + + val = *src; + } + + GUEST_DONE(); +} + +static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 gpa, u64 size, + u64 left) +{ + struct kvm_pre_fault_memory range = { + .gpa = gpa, + .size = size, + .flags = 0, + }; + u64 prev; + int ret, save_errno; + + do { + prev = range.size; + ret = __vcpu_ioctl(vcpu, KVM_PRE_FAULT_MEMORY, &range); + save_errno = errno; + TEST_ASSERT((range.size < prev) ^ (ret < 0), + "%sexpecting range.size to change on %s", + ret < 0 ? "not " : "", + ret < 0 ? "failure" : "success"); + } while (ret >= 0 ? range.size : save_errno == EINTR); + + TEST_ASSERT(range.size == left, + "Completed with %lld bytes left, expected %" PRId64, + range.size, left); + + if (left == 0) + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm); + else + /* No memory slot causes RET_PF_EMULATE. it results in -ENOENT. */ + __TEST_ASSERT_VM_VCPU_IOCTL(ret && save_errno == ENOENT, + "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm); +} + +static void __test_pre_fault_memory(unsigned long vm_type, bool private) +{ + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = vm_type, + }; + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + + uint64_t guest_test_phys_mem; + uint64_t guest_test_virt_mem; + uint64_t alignment, guest_page_size; + + vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code); + + alignment = guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + guest_test_phys_mem = (vm->max_gfn - TEST_NPAGES) * guest_page_size; +#ifdef __s390x__ + alignment = max(0x100000UL, guest_page_size); +#else + alignment = SZ_2M; +#endif + guest_test_phys_mem = align_down(guest_test_phys_mem, alignment); + guest_test_virt_mem = guest_test_phys_mem & ((1ULL << (vm->va_bits - 1)) - 1); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_test_phys_mem, TEST_SLOT, TEST_NPAGES, + private ? 
KVM_MEM_GUEST_MEMFD : 0); + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, TEST_NPAGES); + + if (private) + vm_mem_set_private(vm, guest_test_phys_mem, TEST_SIZE); + pre_fault_memory(vcpu, guest_test_phys_mem, SZ_2M, 0); + pre_fault_memory(vcpu, guest_test_phys_mem + SZ_2M, PAGE_SIZE * 2, PAGE_SIZE); + pre_fault_memory(vcpu, guest_test_phys_mem + TEST_SIZE, PAGE_SIZE, PAGE_SIZE); + + vcpu_args_set(vcpu, 1, guest_test_virt_mem); + vcpu_run(vcpu); + + run = vcpu->run; + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Wanted KVM_EXIT_IO, got exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + break; + } + + kvm_vm_free(vm); +} + +static void test_pre_fault_memory(unsigned long vm_type, bool private) +{ + if (vm_type && !(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(vm_type))) { + pr_info("Skipping tests for vm_type 0x%lx\n", vm_type); + return; + } + + __test_pre_fault_memory(vm_type, private); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY)); + + test_pre_fault_memory(0, false); +#ifdef __x86_64__ + test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, false); + test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, true); +#endif + return 0; +} diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 29b73eedfe74..b14e14cdbfb9 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS config KVM_GENERIC_DIRTYLOG_READ_PROTECT bool +config KVM_GENERIC_PRE_FAULT_MEMORY + bool + config KVM_COMPAT def_bool y depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) @@ -109,3 +112,11 @@ config KVM_GENERIC_PRIVATE_MEM select KVM_GENERIC_MEMORY_ATTRIBUTES select KVM_PRIVATE_MEM bool + +config HAVE_KVM_GMEM_PREPARE + bool + depends on KVM_PRIVATE_MEM + +config HAVE_KVM_GMEM_INVALIDATE + bool + depends on KVM_PRIVATE_MEM diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 747fe251e445..1c509c351261 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -13,14 +13,50 @@ struct kvm_gmem { struct list_head entry; }; -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) +{ +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE + struct list_head *gmem_list = &inode->i_mapping->i_private_list; + struct kvm_gmem *gmem; + + list_for_each_entry(gmem, gmem_list, entry) { + struct kvm_memory_slot *slot; + struct kvm *kvm = gmem->kvm; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + int rc; + + if (!kvm_arch_gmem_prepare_needed(kvm)) + continue; + + slot = xa_load(&gmem->bindings, index); + if (!slot) + continue; + + page = folio_file_page(folio, index); + pfn = page_to_pfn(page); + gfn = slot->base_gfn + index - slot->gmem.pgoff; + rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page))); + if (rc) { + pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", + index, gfn, pfn, rc); + return rc; + } + } + +#endif + return 0; +} + +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare) { struct folio *folio; /* TODO: Support huge pages. 
*/ folio = filemap_grab_folio(inode->i_mapping, index); - if (IS_ERR_OR_NULL(folio)) - return NULL; + if (IS_ERR(folio)) + return folio; /* * Use the up-to-date flag to track whether or not the memory has been @@ -41,6 +77,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) folio_mark_uptodate(folio); } + if (prepare) { + int r = kvm_gmem_prepare_folio(inode, index, folio); + if (r < 0) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(r); + } + } + /* * Ignore accessed, referenced, and dirty flags. The memory is * unevictable and there is no storage to write back to. @@ -145,9 +190,9 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) break; } - folio = kvm_gmem_get_folio(inode, index); - if (!folio) { - r = -ENOMEM; + folio = kvm_gmem_get_folio(inode, index, true); + if (IS_ERR(folio)) { + r = PTR_ERR(folio); break; } @@ -298,10 +343,24 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol return MF_DELAYED; } +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +static void kvm_gmem_free_folio(struct folio *folio) +{ + struct page *page = folio_page(folio, 0); + kvm_pfn_t pfn = page_to_pfn(page); + int order = folio_order(folio); + + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); +} +#endif + static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE + .free_folio = kvm_gmem_free_folio, +#endif }; static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path, @@ -360,7 +419,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) inode->i_mode |= S_IFREG; inode->i_size = size; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); - mapping_set_unmovable(inode->i_mapping); + mapping_set_inaccessible(inode->i_mapping); /* Unmovable mappings are supposed to be marked unevictable as well. 
*/ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); @@ -482,38 +541,34 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) fput(file); } -int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) { pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; - struct kvm_gmem *gmem; + struct kvm_gmem *gmem = file->private_data; struct folio *folio; struct page *page; - struct file *file; int r; - file = kvm_gmem_get_file(slot); - if (!file) + if (file != slot->gmem.file) { + WARN_ON_ONCE(slot->gmem.file); return -EFAULT; + } gmem = file->private_data; - - if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) { - r = -EIO; - goto out_fput; + if (xa_load(&gmem->bindings, index) != slot) { + WARN_ON_ONCE(xa_load(&gmem->bindings, index)); + return -EIO; } - folio = kvm_gmem_get_folio(file_inode(file), index); - if (!folio) { - r = -ENOMEM; - goto out_fput; - } + folio = kvm_gmem_get_folio(file_inode(file), index, prepare); + if (IS_ERR(folio)) + return PTR_ERR(folio); if (folio_test_hwpoison(folio)) { folio_unlock(folio); folio_put(folio); - r = -EHWPOISON; - goto out_fput; + return -EHWPOISON; } page = folio_file_page(folio, index); @@ -525,9 +580,78 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, r = 0; folio_unlock(folio); -out_fput: - fput(file); return r; } + +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +{ + struct file *file = kvm_gmem_get_file(slot); + int r; + + if (!file) + return -EFAULT; + + r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true); + fput(file); + return r; +} EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); + +long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, + kvm_gmem_populate_cb post_populate, void *opaque) +{ + struct file *file; + struct kvm_memory_slot *slot; + void __user *p; + + int ret = 0, max_order; + long i; + + lockdep_assert_held(&kvm->slots_lock); + if (npages < 0) + return -EINVAL; + + slot = gfn_to_memslot(kvm, start_gfn); + if (!kvm_slot_can_be_private(slot)) + return -EINVAL; + + file = kvm_gmem_get_file(slot); + if (!file) + return -EFAULT; + + filemap_invalidate_lock(file->f_mapping); + + npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); + for (i = 0; i < npages; i += (1 << max_order)) { + gfn_t gfn = start_gfn + i; + kvm_pfn_t pfn; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false); + if (ret) + break; + + if (!IS_ALIGNED(gfn, (1 << max_order)) || + (npages - i) < (1 << max_order)) + max_order = 0; + + p = src ? src + i * PAGE_SIZE : NULL; + ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); + + put_page(pfn_to_page(pfn)); + if (ret) + break; + } + + filemap_invalidate_unlock(file->f_mapping); + + fput(file); + return ret && !i ? 
ret : i; +} +EXPORT_SYMBOL_GPL(kvm_gmem_populate); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1192942aef91..3282ff98b3a3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4375,6 +4375,52 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) return fd; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + int idx; + long r; + u64 full_size; + + if (range->flags) + return -EINVAL; + + if (!PAGE_ALIGNED(range->gpa) || + !PAGE_ALIGNED(range->size) || + range->gpa + range->size <= range->gpa) + return -EINVAL; + + vcpu_load(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + full_size = range->size; + do { + if (signal_pending(current)) { + r = -EINTR; + break; + } + + r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); + if (WARN_ON_ONCE(r == 0 || r == -EIO)) + break; + + if (r < 0) + break; + + range->size -= r; + range->gpa += r; + cond_resched(); + } while (range->size); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + vcpu_put(vcpu); + + /* Return success if at least one page was mapped successfully. */ + return full_size == range->size ? r : 0; +} +#endif + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4575,6 +4621,20 @@ out_free1: r = kvm_vcpu_ioctl_get_stats_fd(vcpu); break; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY + case KVM_PRE_FAULT_MEMORY: { + struct kvm_pre_fault_memory range; + + r = -EFAULT; + if (copy_from_user(&range, argp, sizeof(range))) + break; + r = kvm_vcpu_pre_fault_memory(vcpu, &range); + /* Pass back leftover range. */ + if (copy_to_user(argp, &range, sizeof(range))) + r = -EFAULT; + break; + } +#endif default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } |
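
Note (illustration, not part of the patch): the kvm_main.c hunk above provides the generic plumbing for KVM_PRE_FAULT_MEMORY. The handler rejects non-zero flags and unaligned or overflowing ranges, then repeatedly calls kvm_arch_vcpu_pre_fault_memory(), shrinking range->size and advancing range->gpa after each chunk; it bails out with -EINTR only when a signal arrives before any page has been processed, and reports success as long as at least one page was mapped. The sketch below shows how a userspace VMM might consume that contract by driving the ioctl on a vCPU fd until the range is exhausted or an error is reported with no further progress. It assumes a <linux/kvm.h> new enough to provide the ioctl number and struct kvm_pre_fault_memory; pre_fault_range() and vcpu_fd are names invented here, and vcpu_fd is a vCPU file descriptor the caller already owns.

    /* Userspace sketch only, assuming a kernel that exports KVM_PRE_FAULT_MEMORY. */
    #include <errno.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int pre_fault_range(int vcpu_fd, uint64_t gpa, uint64_t size)
    {
            struct kvm_pre_fault_memory range = {
                    .gpa = gpa,
                    .size = size,
                    .flags = 0,             /* must currently be zero */
            };

            while (range.size) {
                    /* On success the kernel advances .gpa and shrinks .size. */
                    if (ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range) < 0) {
                            if (errno == EINTR)
                                    continue;       /* interrupted before any progress */
                            return -errno;          /* e.g. ENOENT outside memslots */
                    }
            }
            return 0;
    }

Looping on the shrinking range rather than on the return value alone matches what the pre_fault_memory_test selftest in this series does with __vcpu_ioctl(): a return of 0 only guarantees that at least one page was processed, so any error that hits partway through the range surfaces on the next retry, once no further progress can be made.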
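
Note (illustration, not part of the patch): the guest_memfd.c changes earlier in the patch also export kvm_gmem_populate(), which walks a GFN range under the filemap invalidate lock, grabs each folio without the prepare step, and hands every pfn plus the matching slice of the userspace source buffer to an architecture-supplied post_populate callback. The sketch below is only meant to illustrate that callback contract: the signature is inferred from the call site above, example_post_populate is a made-up name, and the plain copy_from_user() stands in for whatever measurement or encryption step a real architecture hook would perform on each page.

    /* Kernel-side sketch; callback shape inferred from the kvm_gmem_populate() call site. */
    #include <linux/highmem.h>
    #include <linux/kvm_host.h>
    #include <linux/uaccess.h>

    static int example_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
                                     void __user *src, int order, void *opaque)
    {
            unsigned long i, npages = 1UL << order;

            /* kvm_gmem_populate() passes a NULL src when there is no data to load. */
            if (!src)
                    return 0;

            for (i = 0; i < npages; i++) {
                    void *dst = kmap_local_page(pfn_to_page(pfn + i));
                    unsigned long left = copy_from_user(dst, src + i * PAGE_SIZE,
                                                        PAGE_SIZE);

                    kunmap_local(dst);
                    if (left)
                            return -EFAULT;
            }

            /* A real hook would measure/encrypt the pages here before returning. */
            return 0;
    }

A caller would pass such a callback as kvm_gmem_populate(kvm, start_gfn, user_src, npages, example_post_populate, NULL) with kvm->slots_lock held, since the function lockdep-asserts that lock; it returns the number of pages it processed, or the error code if it failed before making any progress.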