diff options
Diffstat (limited to 'arch/x86/kvm/svm')
-rw-r--r-- | arch/x86/kvm/svm/avic.c | 691 | ||||
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 164 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 417 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 793 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 153 | ||||
-rw-r--r-- | arch/x86/kvm/svm/vmenter.S | 6 |
6 files changed, 1218 insertions, 1006 deletions
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 7338879d1c0c..a34c5c3b164e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -18,8 +18,10 @@ #include <linux/hashtable.h> #include <linux/amd-iommu.h> #include <linux/kvm_host.h> +#include <linux/kvm_irqfd.h> #include <asm/irq_remapping.h> +#include <asm/msr.h> #include "trace.h" #include "lapic.h" @@ -28,36 +30,39 @@ #include "svm.h" /* - * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID, - * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry - * if an interrupt can't be delivered, e.g. because the vCPU isn't running. + * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that + * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't + * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index + * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast + * lookup on the index, where as vCPUs whose index doesn't match their ID need + * to walk the entire xarray of vCPUs in the worst case scenario. * - * For the vCPU ID, use however many bits are currently allowed for the max + * For the vCPU index, use however many bits are currently allowed for the max * guest physical APIC ID (limited by the size of the physical ID table), and * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the * size of the GATag is defined by hardware (32 bits), but is an opaque value * as far as hardware is concerned. */ -#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK +#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK #define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) #define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) -#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK) -#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ - ((vcpu_id) & AVIC_VCPU_ID_MASK)) -#define AVIC_GATAG(vm_id, vcpu_id) \ +#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ + ((vcpu_idx) & AVIC_VCPU_IDX_MASK)) +#define AVIC_GATAG(vm_id, vcpu_idx) \ ({ \ - u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \ + u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \ \ - WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \ + WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \ WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ ga_tag; \ }) -static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u); +static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); static bool force_avic; module_param_unsafe(force_avic, bool, 0444); @@ -74,14 +79,6 @@ static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); bool x2avic_enabled; -/* - * This is a wrapper of struct amd_iommu_ir_data. - */ -struct amd_svm_iommu_ir { - struct list_head node; /* Used by SVM for per-vcpu ir_list */ - void *data; /* Storing pointer to struct amd_ir_data */ -}; - static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -146,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag) struct kvm_svm *kvm_svm; struct kvm_vcpu *vcpu = NULL; u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); + u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag); - pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); - trace_kvm_avic_ga_log(vm_id, vcpu_id); + pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx); + trace_kvm_avic_ga_log(vm_id, vcpu_idx); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { if (kvm_svm->avic_vm_id != vm_id) continue; - vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); + vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx); break; } spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); @@ -179,10 +176,8 @@ void avic_vm_destroy(struct kvm *kvm) if (!enable_apicv) return; - if (kvm_svm->avic_logical_id_table_page) - __free_page(kvm_svm->avic_logical_id_table_page); - if (kvm_svm->avic_physical_id_table_page) - __free_page(kvm_svm->avic_physical_id_table_page); + free_page((unsigned long)kvm_svm->avic_logical_id_table); + free_page((unsigned long)kvm_svm->avic_physical_id_table); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_del(&kvm_svm->hnode); @@ -195,27 +190,19 @@ int avic_vm_init(struct kvm *kvm) int err = -ENOMEM; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); struct kvm_svm *k2; - struct page *p_page; - struct page *l_page; u32 vm_id; if (!enable_apicv) return 0; - /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!p_page) + kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!kvm_svm->avic_physical_id_table) goto free_avic; - kvm_svm->avic_physical_id_table_page = p_page; - - /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!l_page) + kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!kvm_svm->avic_logical_id_table) goto free_avic; - kvm_svm->avic_logical_id_table_page = l_page; - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); again: vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; @@ -241,17 +228,19 @@ free_avic: return err; } +static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm) +{ + return __sme_set(__pa(svm->vcpu.arch.apic->regs)); +} + void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) { struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); - phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); - phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); - phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); - vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; - vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; + vmcb->control.avic_backing_page = avic_get_backing_page_address(svm); + vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table)); + vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table)); + vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE; if (kvm_apicv_activated(svm->vcpu.kvm)) avic_activate_vmcb(svm); @@ -259,32 +248,31 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) avic_deactivate_vmcb(svm); } -static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, - unsigned int index) -{ - u64 *avic_physical_id_table; - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - - if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) || - (index > X2AVIC_MAX_PHYSICAL_ID)) - return NULL; - - avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); - - return &avic_physical_id_table[index]; -} - static int avic_init_backing_page(struct kvm_vcpu *vcpu) { - u64 *entry, new_entry; - int id = vcpu->vcpu_id; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); + u32 id = vcpu->vcpu_id; + u64 new_entry; + /* + * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC + * hardware. Immediately clear apicv_active, i.e. don't wait until the + * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as + * avic_vcpu_load() expects to be called if and only if the vCPU has + * fully initialized AVIC. + */ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || - (id > X2AVIC_MAX_PHYSICAL_ID)) - return -EINVAL; + (id > X2AVIC_MAX_PHYSICAL_ID)) { + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); + vcpu->arch.apic->apicv_active = false; + return 0; + } + + BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE || + (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE); - if (!vcpu->arch.apic->regs) + if (WARN_ON_ONCE(!vcpu->arch.apic->regs)) return -EINVAL; if (kvm_apicv_activated(vcpu->kvm)) { @@ -301,19 +289,21 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return ret; } - svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs); + /* Note, fls64() returns the bit position, +1. */ + BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT > + fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK)); /* Setting AVIC backing page address in the phy APIC ID table */ - entry = avic_get_physical_id_entry(vcpu, id); - if (!entry) - return -EINVAL; + new_entry = avic_get_backing_page_address(svm) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + svm->avic_physical_id_entry = new_entry; - new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); - WRITE_ONCE(*entry, new_entry); - - svm->avic_physical_id_cache = entry; + /* + * Initialize the real table, as vCPUs must have a valid entry in order + * for broadcast IPIs to function correctly (broadcast IPIs ignore + * invalid entries, i.e. aren't guaranteed to generate a VM-Exit). + */ + WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry); return 0; } @@ -330,7 +320,7 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) int cpu = READ_ONCE(vcpu->cpu); if (cpu != get_cpu()) { - wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); } put_cpu(); @@ -447,7 +437,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (apic_x2apic_mode(source)) avic_logical_id_table = NULL; else - avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); + avic_logical_id_table = kvm_svm->avic_logical_id_table; /* * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical @@ -549,7 +539,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - u32 *logical_apic_id_table; u32 cluster, index; ldr = GET_APIC_LOGICAL_ID(ldr); @@ -570,9 +559,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) return NULL; index += (cluster << 2); - logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); - - return &logical_apic_id_table[index]; + return &kvm_svm->avic_logical_id_table[index]; } static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) @@ -721,6 +708,9 @@ int avic_init_vcpu(struct vcpu_svm *svm) int ret; struct kvm_vcpu *vcpu = &svm->vcpu; + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); + if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) return 0; @@ -728,8 +718,6 @@ int avic_init_vcpu(struct vcpu_svm *svm) if (ret) return ret; - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); svm->dfr_reg = APIC_DFR_FLAT; return ret; @@ -741,316 +729,161 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) { - int ret = 0; + struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu; unsigned long flags; - struct amd_svm_iommu_ir *ir; - struct vcpu_svm *svm = to_svm(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. - */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - - if (list_empty(&svm->ir_list)) - goto out; + if (!vcpu) + return; - list_for_each_entry(ir, &svm->ir_list, node) { - if (activate) - ret = amd_iommu_activate_guest_mode(ir->data); - else - ret = amd_iommu_deactivate_guest_mode(ir->data); - if (ret) - break; - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; + spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); + list_del(&irqfd->vcpu_list); + spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); } -static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_vcpu *vcpu, u32 vector) { - unsigned long flags; - struct amd_svm_iommu_ir *cur; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, node) { - if (cur->data != pi->ir_data) - continue; - list_del(&cur->node); - kfree(cur); - break; - } - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -} - -static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - u64 entry; - - if (WARN_ON_ONCE(!pi->ir_data)) - return -EINVAL; - - /** - * In some cases, the existing irte is updated and re-set, - * so we need to check here if it's already been * added - * to the ir_list. - */ - if (pi->prev_ga_tag) { - struct kvm *kvm = svm->vcpu.kvm; - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); - struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - struct vcpu_svm *prev_svm; - - if (!prev_vcpu) { - ret = -EINVAL; - goto out; - } - - prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, pi); - } - - /** - * Allocating new amd_iommu_pi_data, which will get - * add to the per-vcpu ir_list. - */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); - if (!ir) { - ret = -ENOMEM; - goto out; - } - ir->data = pi->ir_data; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - /* - * Update the target pCPU for IOMMU doorbells if the vCPU is running. - * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM - * will update the pCPU info when the vCPU awkened and/or scheduled in. - * See also avic_vcpu_load(). + * If the IRQ was affined to a different vCPU, remove the IRTE metadata + * from the *previous* vCPU's list. */ - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, - true, pi->ir_data); - - list_add(&ir->node, &svm->ir_list); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -out: - return ret; -} + svm_ir_list_del(irqfd); -/* - * Note: - * The HW cannot support posting multicast/broadcast - * interrupts to a vCPU. So, we still use legacy interrupt - * remapping for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - */ -static int -get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct vcpu_data *vcpu_info, struct vcpu_svm **svm) -{ - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu = NULL; - - kvm_set_msi_irq(kvm, e, &irq); - - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", - __func__, irq.vector); - return -1; - } - - pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, - irq.vector); - *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); - vcpu_info->vector = irq.vector; - - return 0; -} - -/* - * avic_pi_update_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - bool enable_remapped_mode = true; - int idx, ret = 0; - - if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) - return 0; - - pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", - __func__, host_irq, guest_irq, set); - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - struct vcpu_data vcpu_info; - struct vcpu_svm *svm = NULL; + if (vcpu) { + /* + * Try to enable guest_mode in IRTE, unless AVIC is inhibited, + * in which case configure the IRTE for legacy mode, but track + * the IRTE metadata so that it can be converted to guest mode + * if AVIC is enabled/uninhibited in the future. + */ + struct amd_iommu_pi_data pi_data = { + .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, + vcpu->vcpu_idx), + .is_guest_mode = kvm_vcpu_apicv_active(vcpu), + .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), + .vector = vector, + }; + struct vcpu_svm *svm = to_svm(vcpu); + u64 entry; + int ret; - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; + /* + * Prevent the vCPU from being scheduled out or migrated until + * the IRTE is updated and its metadata has been added to the + * list of IRQs being posted to the vCPU, to ensure the IRTE + * isn't programmed with stale pCPU/IsRunning information. + */ + guard(spinlock_irqsave)(&svm->ir_list_lock); - /** - * Here, we setup with legacy mode in the following cases: - * 1. When cannot target interrupt to a specific vcpu. - * 2. Unsetting posted interrupt. - * 3. APIC virtualization is disabled for the vcpu. - * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) + /* + * Update the target pCPU for IOMMU doorbells if the vCPU is + * running. If the vCPU is NOT running, i.e. is blocking or + * scheduled out, KVM will update the pCPU info when the vCPU + * is awakened and/or scheduled in. See also avic_vcpu_load(). */ - if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && - kvm_vcpu_apicv_active(&svm->vcpu)) { - struct amd_iommu_pi_data pi; - - enable_remapped_mode = false; - - /* Try to enable guest_mode in IRTE */ - pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & - AVIC_HPA_MASK); - pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, - svm->vcpu.vcpu_id); - pi.is_guest_mode = true; - pi.vcpu_data = &vcpu_info; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Here, we successfully setting up vcpu affinity in - * IOMMU guest mode. Now, we need to store the posted - * interrupt information in a per-vcpu ir_list so that - * we can reference to them directly when we update vcpu - * scheduling information in IOMMU irte. - */ - if (!ret && pi.is_guest_mode) - svm_ir_list_add(svm, &pi); + entry = svm->avic_physical_id_entry; + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) { + pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + } else { + pi_data.cpu = -1; + pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; } - if (!ret && svm) { - trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, - e->gsi, vcpu_info.vector, - vcpu_info.pi_desc_addr, set); - } + ret = irq_set_vcpu_affinity(host_irq, &pi_data); + if (ret) + return ret; - if (ret < 0) { - pr_err("%s: failed to update PI IRTE\n", __func__); - goto out; + /* + * Revert to legacy mode if the IOMMU didn't provide metadata + * for the IRTE, which KVM needs to keep the IRTE up-to-date, + * e.g. if the vCPU is migrated or AVIC is disabled. + */ + if (WARN_ON_ONCE(!pi_data.ir_data)) { + irq_set_vcpu_affinity(host_irq, NULL); + return -EIO; } - } - ret = 0; - if (enable_remapped_mode) { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; + irqfd->irq_bypass_data = pi_data.ir_data; + list_add(&irqfd->vcpu_list, &svm->ir_list); + return 0; + } + return irq_set_vcpu_affinity(host_irq, NULL); +} - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); +enum avic_vcpu_action { + /* + * There is no need to differentiate between activate and deactivate, + * as KVM only refreshes AVIC state when the vCPU is scheduled in and + * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is + * being (de)activated. + */ + AVIC_TOGGLE_ON_OFF = BIT(0), + AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF, + AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF, - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; + /* + * No unique action is required to deal with a vCPU that stops/starts + * running. A vCPU that starts running by definition stops blocking as + * well, and a vCPU that stops running can't have been blocking, i.e. + * doesn't need to toggle GALogIntr. + */ + AVIC_START_RUNNING = 0, + AVIC_STOP_RUNNING = 0, - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } - } -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} + /* + * When a vCPU starts blocking, KVM needs to set the GALogIntr flag + * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is + * sent to the vCPU. + */ + AVIC_START_BLOCKING = BIT(1), +}; -static inline int -avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) +static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, + enum avic_vcpu_action action) { - int ret = 0; - struct amd_svm_iommu_ir *ir; + bool ga_log_intr = (action & AVIC_START_BLOCKING); struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_kernel_irqfd *irqfd; lockdep_assert_held(&svm->ir_list_lock); - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - /* * Here, we go through the per-vcpu ir_list to update all existing * interrupt remapping table entry targeting this vcpu. */ if (list_empty(&svm->ir_list)) - return 0; + return; - list_for_each_entry(ir, &svm->ir_list, node) { - ret = amd_iommu_update_ga(cpu, r, ir->data); - if (ret) - return ret; + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { + void *data = irqfd->irq_bypass_data; + + if (!(action & AVIC_TOGGLE_ON_OFF)) + WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr)); + else if (cpu >= 0) + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr)); + else + WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); } - return 0; } -void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, + enum avic_vcpu_action action) { - u64 entry; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); unsigned long flags; + u64 entry; lockdep_assert_preemption_disabled(); if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; - /* - * No need to update anything if the vCPU is blocking, i.e. if the vCPU - * is being scheduled in after being preempted. The CPU entries in the - * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. - * If the vCPU was migrated, its new CPU value will be stuffed when the - * vCPU unblocks. - */ - if (kvm_vcpu_is_blocking(vcpu)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) return; /* @@ -1062,38 +895,57 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ spin_lock_irqsave(&svm->ir_list_lock, flags); - entry = READ_ONCE(*(svm->avic_physical_id_cache)); + entry = svm->avic_physical_id_entry; WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); - entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK | + AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); + svm->avic_physical_id_entry = entry; + + /* + * If IPI virtualization is disabled, clear IsRunning when updating the + * actual Physical ID table, so that the CPU never sees IsRunning=1. + * Keep the APIC ID up-to-date in the entry to minimize the chances of + * things going sideways if hardware peeks at the ID. + */ + if (!enable_ipiv) + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); + + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action); spin_unlock_irqrestore(&svm->ir_list_lock, flags); } -void avic_vcpu_put(struct kvm_vcpu *vcpu) +void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - u64 entry; + /* + * No need to update anything if the vCPU is blocking, i.e. if the vCPU + * is being scheduled in after being preempted. The CPU entries in the + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. + * If the vCPU was migrated, its new CPU value will be stuffed when the + * vCPU unblocks. + */ + if (kvm_vcpu_is_blocking(vcpu)) + return; + + __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING); +} + +static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) +{ + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); unsigned long flags; + u64 entry = svm->avic_physical_id_entry; lockdep_assert_preemption_disabled(); - /* - * Note, reading the Physical ID entry outside of ir_list_lock is safe - * as only the pCPU that has loaded (or is loading) the vCPU is allowed - * to modify the entry, and preemption is disabled. I.e. the vCPU - * can't be scheduled out and thus avic_vcpu_{put,load}() can't run - * recursively. - */ - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - - /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ - if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) return; /* @@ -1106,13 +958,62 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) */ spin_lock_irqsave(&svm->ir_list_lock, flags); - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + avic_update_iommu_vcpu_affinity(vcpu, -1, action); + + WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); + /* + * Keep the previous APIC ID in the entry so that a rogue doorbell from + * hardware is at least restricted to a CPU associated with the vCPU. + */ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + + if (enable_ipiv) + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); + + /* + * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as + * it's a synthetic flag that usurps an unused should-be-zero bit. + */ + if (action & AVIC_START_BLOCKING) + entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; + + svm->avic_physical_id_entry = entry; spin_unlock_irqrestore(&svm->ir_list_lock, flags); +} + +void avic_vcpu_put(struct kvm_vcpu *vcpu) +{ + /* + * Note, reading the Physical ID entry outside of ir_list_lock is safe + * as only the pCPU that has loaded (or is loading) the vCPU is allowed + * to modify the entry, and preemption is disabled. I.e. the vCPU + * can't be scheduled out and thus avic_vcpu_{put,load}() can't run + * recursively. + */ + u64 entry = to_svm(vcpu)->avic_physical_id_entry; + + /* + * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the + * vCPU is preempted while its in the process of blocking. WARN if the + * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put + * the AVIC if it wasn't previously loaded. + */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) { + if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu))) + return; + /* + * The vCPU was preempted while blocking, ensure its IRTEs are + * configured to generate GA Log Interrupts. + */ + if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR)))) + return; + } + + __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING : + AVIC_STOP_RUNNING); } void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) @@ -1141,19 +1042,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { - bool activated = kvm_vcpu_apicv_active(vcpu); - if (!enable_apicv) return; + /* APICv should only be toggled on/off while the vCPU is running. */ + WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu)); + avic_refresh_virtual_apic_mode(vcpu); - if (activated) - avic_vcpu_load(vcpu, vcpu->cpu); + if (kvm_vcpu_apicv_active(vcpu)) + __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE); else - avic_vcpu_put(vcpu); - - avic_set_pi_irte_mode(vcpu, activated); + __avic_vcpu_put(vcpu, AVIC_DEACTIVATE); } void avic_vcpu_blocking(struct kvm_vcpu *vcpu) @@ -1161,20 +1061,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu) if (!kvm_vcpu_apicv_active(vcpu)) return; - /* - * Unload the AVIC when the vCPU is about to block, _before_ - * the vCPU actually blocks. - * - * Any IRQs that arrive before IsRunning=0 will not cause an - * incomplete IPI vmexit on the source, therefore vIRR will also - * be checked by kvm_vcpu_check_block() before blocking. The - * memory barrier implicit in set_current_state orders writing - * IsRunning=0 before reading the vIRR. The processor needs a - * matching memory barrier on interrupt delivery between writing - * IRR and reading IsRunning; the lack of this barrier might be - * the cause of errata #1235). - */ - avic_vcpu_put(vcpu); + /* + * Unload the AVIC when the vCPU is about to block, _before_ the vCPU + * actually blocks. + * + * Note, any IRQs that arrive before IsRunning=0 will not cause an + * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles + * this by checking vIRR one last time before blocking. The memory + * barrier implicit in set_current_state orders writing IsRunning=0 + * before reading the vIRR. The processor needs a matching memory + * barrier on interrupt delivery between writing IRR and reading + * IsRunning; the lack of this barrier might be the cause of errata #1235). + * + * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM + * doesn't need to detect events for scheduling purposes. The doorbell + * used to signal running vCPUs cannot be blocked, i.e. will perturb the + * CPU and cause noisy neighbor problems if the VM is sending interrupts + * to the vCPU while it's scheduled out. + */ + __avic_vcpu_put(vcpu, AVIC_START_BLOCKING); } void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) @@ -1227,6 +1132,14 @@ bool avic_hardware_setup(void) if (x2avic_enabled) pr_info("x2AVIC enabled\n"); + /* + * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) + * due to erratum 1235, which results in missed VM-Exits on the sender + * and thus missed wake events for blocking vCPUs due to the CPU + * failing to see a software update to clear IsRunning. + */ + enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17; + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); return true; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 834b67672d50..b7fd2e869998 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -185,12 +185,87 @@ void recalc_intercepts(struct vcpu_svm *svm) } /* + * This array (and its actual size) holds the set of offsets (indexing by chunk + * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the + * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g. + * based on CPUID features. This array only tracks MSRs that *might* be passed + * through to the guest. + * + * Hardcode the capacity of the array based on the maximum number of _offsets_. + * MSRs are batched together, so there are fewer offsets than MSRs. + */ +static int nested_svm_msrpm_merge_offsets[7] __ro_after_init; +static int nested_svm_nr_msrpm_merge_offsets __ro_after_init; +typedef unsigned long nsvm_msrpm_merge_t; + +int __init nested_svm_init_msrpm_merge_offsets(void) +{ + static const u32 merge_msrs[] __initconst = { + MSR_STAR, + MSR_IA32_SYSENTER_CS, + MSR_IA32_SYSENTER_EIP, + MSR_IA32_SYSENTER_ESP, + #ifdef CONFIG_X86_64 + MSR_GS_BASE, + MSR_FS_BASE, + MSR_KERNEL_GS_BASE, + MSR_LSTAR, + MSR_CSTAR, + MSR_SYSCALL_MASK, + #endif + MSR_IA32_SPEC_CTRL, + MSR_IA32_PRED_CMD, + MSR_IA32_FLUSH_CMD, + MSR_IA32_APERF, + MSR_IA32_MPERF, + MSR_IA32_LASTBRANCHFROMIP, + MSR_IA32_LASTBRANCHTOIP, + MSR_IA32_LASTINTFROMIP, + MSR_IA32_LASTINTTOIP, + }; + int i, j; + + for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) { + int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]); + u32 offset; + + if (WARN_ON(bit_nr < 0)) + return -EIO; + + /* + * Merging is done in chunks to reduce the number of accesses + * to L1's bitmap. + */ + offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t); + + for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) { + if (nested_svm_msrpm_merge_offsets[j] == offset) + break; + } + + if (j < nested_svm_nr_msrpm_merge_offsets) + continue; + + if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets))) + return -EIO; + + nested_svm_msrpm_merge_offsets[j] = offset; + nested_svm_nr_msrpm_merge_offsets++; + } + + return 0; +} + +/* * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function * is optimized in that it only merges the parts where KVM MSR permission bitmap * may contain zero bits. */ -static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) +static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm; + nsvm_msrpm_merge_t *msrpm01 = svm->msrpm; int i; /* @@ -205,7 +280,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) if (!svm->nested.force_msr_bitmap_recalc) { struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; - if (kvm_hv_hypercall_enabled(&svm->vcpu) && + if (kvm_hv_hypercall_enabled(vcpu) && hve->hv_enlightenments_control.msr_bitmap && (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS))) goto set_msrpm_base_pa; @@ -215,25 +290,17 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; - for (i = 0; i < MSRPM_OFFSETS; i++) { - u32 value, p; - u64 offset; - - if (msrpm_offsets[i] == 0xffffffff) - break; - - p = msrpm_offsets[i]; + for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) { + const int p = nested_svm_msrpm_merge_offsets[i]; + nsvm_msrpm_merge_t l1_val; + gpa_t gpa; - /* x2apic msrs are intercepted always for the nested guest */ - if (is_x2apic_msrpm_offset(p)) - continue; - - offset = svm->nested.ctl.msrpm_base_pa + (p * 4); + gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val)); - if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) + if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val))) return false; - svm->nested.msrpm[p] = svm->msrpm[p] | value; + msrpm02[p] = msrpm01[p] | l1_val; } svm->nested.force_msr_bitmap_recalc = false; @@ -678,6 +745,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + /* + * Stash vmcb02's counter if the guest hasn't moved past the guilty + * instruction; otherwise, reset the counter to '0'. + * + * In order to detect if L2 has made forward progress or not, track the + * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP + * is changed, guest has clearly made forward progress, bus_lock_counter + * still remained '1', so reset bus_lock_counter to '0'. Eg. In the + * scenario, where a buslock happened in L1 before VMRUN, the bus lock + * firmly happened on an instruction in the past. Even if vmcb01's + * counter is still '1', (because the guilty instruction got patched), + * the vCPU has clearly made forward progress and so KVM should reset + * vmcb02's counter to '0'. + * + * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN + * to prevent the same guilty instruction from triggering a VM-Exit. Eg. + * if userspace rate-limits the vCPU, then it's entirely possible that + * L1's tick interrupt is pending by the time userspace re-runs the + * vCPU. If KVM unconditionally clears the counter on VMRUN, then when + * L1 re-enters L2, the same instruction will trigger a VM-Exit and the + * entire cycle start over. + */ + if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) + vmcb02->control.bus_lock_counter = 1; + else + vmcb02->control.bus_lock_counter = 0; + /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ @@ -910,7 +1004,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true)) goto out_exit_err; - if (nested_svm_vmrun_msrpm(svm)) + if (nested_svm_merge_msrpm(vcpu)) goto out; out_exit_err: @@ -1039,8 +1133,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm) } + /* + * Invalidate bus_lock_rip unless KVM is still waiting for the guest + * to make forward progress before re-enabling bus lock detection. + */ + if (!vmcb02->control.bus_lock_counter) + svm->nested.ctl.bus_lock_rip = INVALID_GPA; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + kvm_nested_vmexit_handle_ibrs(vcpu); + svm_switch_vmcb(svm, &svm->vmcb01); /* @@ -1194,7 +1297,6 @@ int svm_allocate_nested(struct vcpu_svm *svm) svm->nested.msrpm = svm_vcpu_alloc_msrpm(); if (!svm->nested.msrpm) goto err_free_vmcb02; - svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); svm->nested.initialized = true; return 0; @@ -1254,26 +1356,26 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) { - u32 offset, msr, value; - int write, mask; + gpa_t base = svm->nested.ctl.msrpm_base_pa; + int write, bit_nr; + u8 value, mask; + u32 msr; if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - offset = svm_msrpm_offset(msr); + bit_nr = svm_msrpm_bit_nr(msr); write = svm->vmcb->control.exit_info_1 & 1; - mask = 1 << ((2 * (msr & 0xf)) + write); - if (offset == MSR_INVALID) + if (bit_nr < 0) return NESTED_EXIT_DONE; - /* Offset is in 32 bit units but need in 8 bit units */ - offset *= 4; - - if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE, + &value, sizeof(value))) return NESTED_EXIT_DONE; + mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1)); return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } @@ -1783,13 +1885,11 @@ out_free: static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - if (WARN_ON(!is_guest_mode(vcpu))) return true; if (!vcpu->arch.pdptrs_from_userspace && - !nested_npt_enabled(svm) && is_pae_paging(vcpu)) + !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu)) /* * Reload the guest's PDPTRs since after a migration * the guest CR3 might be restored prior to setting the nested @@ -1798,7 +1898,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; - if (!nested_svm_vmrun_msrpm(svm)) { + if (!nested_svm_merge_msrpm(vcpu)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0bc708ee2788..2fbdebf79fbb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -26,6 +26,7 @@ #include <asm/fpu/xcr.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> +#include <asm/msr.h> #include <asm/sev.h> #include "mmu.h" @@ -116,6 +117,7 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) */ down_write(&sev_deactivate_lock); + /* SNP firmware requires use of WBINVD for ASID recycling. */ wbinvd_on_all_cpus(); if (sev_snp_enabled) @@ -445,7 +447,12 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, init_args.probe = false; ret = sev_platform_init(&init_args); if (ret) - goto e_free; + goto e_free_asid; + + if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { + ret = -ENOMEM; + goto e_free_asid; + } /* This needs to happen after SEV/SNP firmware initialization. */ if (vm_type == KVM_X86_SNP_VM) { @@ -463,6 +470,8 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, return 0; e_free: + free_cpumask_var(sev->have_run_cpus); +e_free_asid: argp->error = init_args.error; sev_asid_free(sev); sev->asid = 0; @@ -560,6 +569,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; + sev->policy = params.policy; + memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -705,6 +716,33 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) } } +static void sev_writeback_caches(struct kvm *kvm) +{ + /* + * Note, the caller is responsible for ensuring correctness if the mask + * can be modified, e.g. if a CPU could be doing VMRUN. + */ + if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus)) + return; + + /* + * Ensure that all dirty guest tagged cache entries are written back + * before releasing the pages back to the system for use. CLFLUSH will + * not do this without SME_COHERENT, and flushing many cache lines + * individually is slower than blasting WBINVD for large VMs, so issue + * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported) + * on CPUs that have done VMRUN, i.e. may have dirtied data using the + * VM's ASID. + * + * For simplicity, never remove CPUs from the bitmap. Ideally, KVM + * would clear the mask when flushing caches, but doing so requires + * serializing multiple calls and having responding CPUs (to the IPI) + * mark themselves as still running if they are running (or about to + * run) a vCPU for the VM. + */ + wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus); +} + static unsigned long get_num_contig_pages(unsigned long idx, struct page **inpages, unsigned long npages) { @@ -1592,11 +1630,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL); if (!hdr) goto e_unpin; - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = kzalloc(params.trans_len, GFP_KERNEL); if (!trans_data) goto e_free_hdr; @@ -1882,70 +1920,6 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } -/* vCPU mutex subclasses. */ -enum sev_migration_role { - SEV_MIGRATION_SOURCE = 0, - SEV_MIGRATION_TARGET, - SEV_NR_MIGRATION_ROLES, -}; - -static int sev_lock_vcpus_for_migration(struct kvm *kvm, - enum sev_migration_role role) -{ - struct kvm_vcpu *vcpu; - unsigned long i, j; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable_nested(&vcpu->mutex, role)) - goto out_unlock; - -#ifdef CONFIG_PROVE_LOCKING - if (!i) - /* - * Reset the role to one that avoids colliding with - * the role used for the first vcpu mutex. - */ - role = SEV_NR_MIGRATION_ROLES; - else - mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); -#endif - } - - return 0; - -out_unlock: - - kvm_for_each_vcpu(j, vcpu, kvm) { - if (i == j) - break; - -#ifdef CONFIG_PROVE_LOCKING - if (j) - mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); -#endif - - mutex_unlock(&vcpu->mutex); - } - return -EINTR; -} - -static void sev_unlock_vcpus_for_migration(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - unsigned long i; - bool first = true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (first) - first = false; - else - mutex_acquire(&vcpu->mutex.dep_map, - SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); - - mutex_unlock(&vcpu->mutex); - } -} - static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); @@ -2032,6 +2006,10 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) struct kvm_vcpu *src_vcpu; unsigned long i; + if (src->created_vcpus != atomic_read(&src->online_vcpus) || + dst->created_vcpus != atomic_read(&dst->online_vcpus)) + return -EBUSY; + if (!sev_es_guest(src)) return 0; @@ -2083,10 +2061,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); + ret = kvm_lock_all_vcpus(kvm); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); + ret = kvm_lock_all_vcpus(source_kvm); if (ret) goto out_dst_vcpu; @@ -2094,15 +2072,26 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) if (ret) goto out_source_vcpu; + /* + * Allocate a new have_run_cpus for the destination, i.e. don't copy + * the set of CPUs from the source. If a CPU was used to run a vCPU in + * the source VM but is never used for the destination VM, then the CPU + * can only have cached memory that was accessible to the source VM. + */ + if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { + ret = -ENOMEM; + goto out_source_vcpu; + } + sev_migrate_from(kvm, source_kvm); kvm_vm_dead(source_kvm); cg_cleanup_sev = src_sev; ret = 0; out_source_vcpu: - sev_unlock_vcpus_for_migration(source_kvm); + kvm_unlock_all_vcpus(source_kvm); out_dst_vcpu: - sev_unlock_vcpus_for_migration(kvm); + kvm_unlock_all_vcpus(kvm); out_dst_cgroup: /* Operates on the source on success, on the destination on failure. */ if (charged) @@ -2192,12 +2181,10 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; /* Check for policy bits that must be set */ - if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) || - !(params.policy & SNP_POLICY_MASK_SMT)) + if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO)) return -EINVAL; - if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) - return -EINVAL; + sev->policy = params.policy; sev->snp_context = snp_context_create(kvm, argp); if (!sev->snp_context) @@ -2753,12 +2740,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm, goto failed; } - /* - * Ensure that all guest tagged cache entries are flushed before - * releasing the pages back to the system for use. CLFLUSH will - * not do this, so issue a WBINVD. - */ - wbinvd_on_all_cpus(); + sev_writeback_caches(kvm); __unregister_enc_region_locked(kvm, region); @@ -2800,13 +2782,18 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) goto e_unlock; } + mirror_sev = to_kvm_sev_info(kvm); + if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { + ret = -ENOMEM; + goto e_unlock; + } + /* * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ source_sev = to_kvm_sev_info(source_kvm); kvm_get_kvm(source_kvm); - mirror_sev = to_kvm_sev_info(kvm); list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ @@ -2868,7 +2855,13 @@ void sev_vm_destroy(struct kvm *kvm) WARN_ON(!list_empty(&sev->mirror_vms)); - /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ + free_cpumask_var(sev->have_run_cpus); + + /* + * If this is a mirror VM, remove it from the owner's list of a mirrors + * and skip ASID cleanup (the ASID is tied to the lifetime of the owner). + * Note, mirror VMs don't support registering encrypted regions. + */ if (is_mirroring_enc_context(kvm)) { struct kvm *owner_kvm = sev->enc_context_owner; @@ -2879,12 +2872,6 @@ void sev_vm_destroy(struct kvm *kvm) return; } - /* - * Ensure that all guest tagged cache entries are flushed before - * releasing the pages back to the system for use. CLFLUSH will - * not do this, so issue a WBINVD. - */ - wbinvd_on_all_cpus(); /* * if userspace was terminated before unregistering the memory regions @@ -2930,9 +2917,37 @@ void __init sev_set_cpu_caps(void) } } +static bool is_sev_snp_initialized(void) +{ + struct sev_user_data_snp_status *status; + struct sev_data_snp_addr buf; + bool initialized = false; + int ret, error = 0; + + status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); + if (!status) + return false; + + buf.address = __psp_pa(status); + ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); + if (ret) { + pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + goto out; + } + + initialized = !!status->state; + +out: + snp_free_firmware_page(status); + + return initialized; +} + void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + struct sev_platform_init_args init_args = {0}; bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -3033,6 +3048,14 @@ void __init sev_hardware_setup(void) sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: + if (sev_enabled) { + init_args.probe = true; + if (sev_platform_init(&init_args)) + sev_supported = sev_es_supported = sev_snp_supported = false; + else if (sev_snp_supported) + sev_snp_supported = is_sev_snp_initialized(); + } + if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : @@ -3074,6 +3097,8 @@ void sev_hardware_unsetup(void) misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); + + sev_platform_shutdown(); } int sev_cpu_init(struct svm_cpu_data *sd) @@ -3116,30 +3141,29 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) /* * VM Page Flush takes a host virtual address and a guest ASID. Fall - * back to WBINVD if this faults so as not to make any problems worse - * by leaving stale encrypted data in the cache. + * back to full writeback of caches if this faults so as not to make + * any problems worse by leaving stale encrypted data in the cache. */ - if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) - goto do_wbinvd; + if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + goto do_sev_writeback_caches; return; -do_wbinvd: - wbinvd_on_all_cpus(); +do_sev_writeback_caches: + sev_writeback_caches(vcpu->kvm); } void sev_guest_memory_reclaimed(struct kvm *kvm) { /* * With SNP+gmem, private/encrypted memory is unreachable via the - * hva-based mmu notifiers, so these events are only actually - * pertaining to shared pages where there is no need to perform - * the WBINVD to flush associated caches. + * hva-based mmu notifiers, i.e. these events are explicitly scoped to + * shared pages, where there's no need to flush caches. */ if (!sev_guest(kvm) || sev_snp_guest(kvm)) return; - wbinvd_on_all_cpus(); + sev_writeback_caches(kvm); } void sev_free_vcpu(struct kvm_vcpu *vcpu) @@ -3173,9 +3197,14 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -3184,18 +3213,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. + */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -3266,11 +3301,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3465,6 +3495,15 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu) if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) return -EINVAL; + /* + * To optimize cache flushes when memory is reclaimed from an SEV VM, + * track physical CPUs that enter the guest for SEV VMs and thus can + * have encrypted, dirty data in the cache, and flush caches only for + * CPUs that have entered the guest. + */ + if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus)) + cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus); + /* Assign the asid allocated with this SEV guest */ svm->asid = asid; @@ -3897,9 +3936,9 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) * From this point forward, the VMSA will always be a guest-mapped page * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but - * that involves cleanups like wbinvd_on_all_cpus() which would ideally - * be handled during teardown rather than guest boot. Deferring that - * also allows the existing logic for SEV-ES VMSAs to be re-used with + * that involves cleanups like flushing caches, which would ideally be + * handled during teardown rather than guest boot. Deferring that also + * allows the existing logic for SEV-ES VMSAs to be re-used with * minimal SNP-specific changes. */ svm->sev_es.snp_has_guest_vmsa = true; @@ -3988,10 +4027,8 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) * Unless Creation is deferred until INIT, signal the vCPU to update * its state. */ - if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) { - kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); - kvm_vcpu_kick(target_vcpu); - } + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); return 0; } @@ -4403,16 +4440,17 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) count, in); } -static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) +void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + /* Clear intercepts on MSRs that are context switched by hardware. */ + svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW); - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || - guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); - - set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); - } + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) + svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW, + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)); /* * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if @@ -4426,11 +4464,9 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) * XSAVES being exposed to the guest so that KVM can at least honor * guest CPUID for RDMSR and WRMSR. */ - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); - else - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0); + svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW, + !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) || + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)); } void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) @@ -4442,15 +4478,12 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); - - if (sev_es_guest(svm->vcpu.kvm)) - sev_es_vcpu_after_set_cpuid(svm); } static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; - struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; @@ -4461,8 +4494,16 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) * the VMSA will be NULL if this vCPU is the destination for intrahost * migration, and will be copied later. */ - if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) - svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (!svm->sev_es.snp_has_guest_vmsa) { + if (svm->sev_es.vmsa) + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + else + svm->vmcb->control.vmsa_pa = INVALID_PAGE; + } + + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | + VMCB_ALLOWED_SEV_FEATURES_VALID; /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); @@ -4500,10 +4541,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); - - /* Clear intercepts on selected MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); } void sev_init_vmcb(struct vcpu_svm *svm) @@ -4892,7 +4929,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) /* * SEV-ES avoids host/guest cache coherency issues through - * WBINVD hooks issued via MMU notifiers during run-time, and + * WBNOINVD hooks issued via MMU notifiers during run-time, and * KVM's VM destroy path at shutdown. Those MMU notifier events * don't cover gmem since there is no requirement to map pages * to a HVA in order to use them for a running guest. While the @@ -4924,3 +4961,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return level; } + +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area *vmsa; + struct kvm_sev_info *sev; + int error = 0; + int ret; + + if (!sev_es_guest(vcpu->kvm)) + return NULL; + + /* + * If the VMSA has not yet been encrypted, return a pointer to the + * current un-encrypted VMSA. + */ + if (!vcpu->arch.guest_state_protected) + return (struct vmcb_save_area *)svm->sev_es.vmsa; + + sev = to_kvm_sev_info(vcpu->kvm); + + /* Check if the SEV policy allows debugging */ + if (sev_snp_guest(vcpu->kvm)) { + if (!(sev->policy & SNP_POLICY_DEBUG)) + return NULL; + } else { + if (sev->policy & SEV_POLICY_NODBG) + return NULL; + } + + if (sev_snp_guest(vcpu->kvm)) { + struct sev_data_snp_dbg dbg = {0}; + + vmsa = snp_alloc_firmware_page(__GFP_ZERO); + if (!vmsa) + return NULL; + + dbg.gctx_paddr = __psp_pa(sev->snp_context); + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); + + /* + * Return the target page to a hypervisor page no matter what. + * If this fails, the page can't be used, so leak it and don't + * try to use it. + */ + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) + return NULL; + + if (ret) { + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + free_page((unsigned long)vmsa); + + return NULL; + } + } else { + struct sev_data_dbg dbg = {0}; + struct page *vmsa_page; + + vmsa_page = alloc_page(GFP_KERNEL); + if (!vmsa_page) + return NULL; + + vmsa = page_address(vmsa_page); + + dbg.handle = sev->handle; + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + dbg.len = PAGE_SIZE; + + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); + if (ret) { + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", + ret, error, error); + __free_page(vmsa_page); + + return NULL; + } + } + + return vmsa; +} + +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) +{ + /* If the VMSA has not yet been encrypted, nothing was allocated */ + if (!vcpu->arch.guest_state_protected || !vmsa) + return; + + free_page((unsigned long)vmsa); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d5d0c5c3300b..d9931c6c4bc6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -29,8 +29,10 @@ #include <linux/cc_platform.h> #include <linux/smp.h> #include <linux/string_choices.h> +#include <linux/mutex.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -70,8 +72,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); static bool erratum_383_found __read_mostly; -u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; - /* * Set osvw_len to higher value when updated Revision Guides * are published and we know what the new status bits are @@ -80,72 +80,6 @@ static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) - -static const struct svm_direct_access_msrs { - u32 index; /* Index of the MSR */ - bool always; /* True if intercept is initially cleared */ -} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { - { .index = MSR_STAR, .always = true }, - { .index = MSR_IA32_SYSENTER_CS, .always = true }, - { .index = MSR_IA32_SYSENTER_EIP, .always = false }, - { .index = MSR_IA32_SYSENTER_ESP, .always = false }, -#ifdef CONFIG_X86_64 - { .index = MSR_GS_BASE, .always = true }, - { .index = MSR_FS_BASE, .always = true }, - { .index = MSR_KERNEL_GS_BASE, .always = true }, - { .index = MSR_LSTAR, .always = true }, - { .index = MSR_CSTAR, .always = true }, - { .index = MSR_SYSCALL_MASK, .always = true }, -#endif - { .index = MSR_IA32_SPEC_CTRL, .always = false }, - { .index = MSR_IA32_PRED_CMD, .always = false }, - { .index = MSR_IA32_FLUSH_CMD, .always = false }, - { .index = MSR_IA32_DEBUGCTLMSR, .always = false }, - { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, - { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, - { .index = MSR_IA32_LASTINTFROMIP, .always = false }, - { .index = MSR_IA32_LASTINTTOIP, .always = false }, - { .index = MSR_IA32_XSS, .always = false }, - { .index = MSR_EFER, .always = false }, - { .index = MSR_IA32_CR_PAT, .always = false }, - { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, - { .index = MSR_TSC_AUX, .always = false }, - { .index = X2APIC_MSR(APIC_ID), .always = false }, - { .index = X2APIC_MSR(APIC_LVR), .always = false }, - { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, - { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, - { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, - { .index = X2APIC_MSR(APIC_EOI), .always = false }, - { .index = X2APIC_MSR(APIC_RRR), .always = false }, - { .index = X2APIC_MSR(APIC_LDR), .always = false }, - { .index = X2APIC_MSR(APIC_DFR), .always = false }, - { .index = X2APIC_MSR(APIC_SPIV), .always = false }, - { .index = X2APIC_MSR(APIC_ISR), .always = false }, - { .index = X2APIC_MSR(APIC_TMR), .always = false }, - { .index = X2APIC_MSR(APIC_IRR), .always = false }, - { .index = X2APIC_MSR(APIC_ESR), .always = false }, - { .index = X2APIC_MSR(APIC_ICR), .always = false }, - { .index = X2APIC_MSR(APIC_ICR2), .always = false }, - - /* - * Note: - * AMD does not virtualize APIC TSC-deadline timer mode, but it is - * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, - * the AVIC hardware would generate GP fault. Therefore, always - * intercept the MSR 0x832, and do not setup direct_access_msr. - */ - { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, - { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, - { .index = X2APIC_MSR(APIC_LVT0), .always = false }, - { .index = X2APIC_MSR(APIC_LVT1), .always = false }, - { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, - { .index = X2APIC_MSR(APIC_TMICT), .always = false }, - { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, - { .index = X2APIC_MSR(APIC_TDCR), .always = false }, - { .index = MSR_INVALID, .always = false }, -}; - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -230,6 +164,9 @@ module_param(tsc_scaling, int, 0444); */ static bool avic; module_param(avic, bool, 0444); +module_param(enable_ipiv, bool, 0444); + +module_param(enable_device_posted_irqs, bool, 0444); bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -249,6 +186,8 @@ static unsigned long iopm_base; DEFINE_PER_CPU(struct svm_cpu_data, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); + /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. @@ -258,33 +197,6 @@ DEFINE_PER_CPU(struct svm_cpu_data, svm_data); */ static int tsc_aux_uret_slot __read_mostly = -1; -static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; - -#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) -#define MSRS_RANGE_SIZE 2048 -#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) - -u32 svm_msrpm_offset(u32 msr) -{ - u32 offset; - int i; - - for (i = 0; i < NUM_MSR_MAPS; i++) { - if (msr < msrpm_ranges[i] || - msr >= msrpm_ranges[i] + MSRS_IN_RANGE) - continue; - - offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ - offset += (i * MSRS_RANGE_SIZE); /* add range offset */ - - /* Now we have the u8 offset - but need the u32 offset */ - return offset / 4; - } - - /* MSR not in any range */ - return MSR_INVALID; -} - static int get_npt_level(void) { #ifdef CONFIG_X86_64 @@ -475,24 +387,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) static void svm_init_erratum_383(void) { - u32 low, high; - int err; u64 val; if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) + if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) return; val |= (1ULL << 47); - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + native_write_msr_safe(MSR_AMD64_DC_CFG, val); erratum_383_found = true; } @@ -566,7 +472,7 @@ static void __svm_write_tsc_multiplier(u64 multiplier) if (multiplier == __this_cpu_read(current_tsc_ratio)) return; - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); } @@ -579,15 +485,15 @@ static inline void kvm_cpu_svm_disable(void) { uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); + wrmsrq(MSR_VM_HSAVE_PA, 0); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) { /* * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and * NMI aren't blocked. */ stgi(); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); } } @@ -607,9 +513,6 @@ static void svm_disable_virtualization_cpu(void) kvm_cpu_svm_disable(); amd_pmu_disable_virt(); - - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); } static int svm_enable_virtualization_cpu(void) @@ -619,7 +522,7 @@ static int svm_enable_virtualization_cpu(void) uint64_t efer; int me = raw_smp_processor_id(); - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) return -EBUSY; @@ -629,9 +532,9 @@ static int svm_enable_virtualization_cpu(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - wrmsrl(MSR_EFER, efer | EFER_SVME); + wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); + wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -652,13 +555,12 @@ static int svm_enable_virtualization_cpu(void) * erratum is present everywhere). */ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; + u64 len, status = 0; int err; - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); if (err) osvw_status = osvw_len = 0; @@ -687,9 +589,6 @@ static int svm_enable_virtualization_cpu(void) rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); - return 0; } @@ -764,50 +663,8 @@ static void clr_dr_intercepts(struct vcpu_svm *svm) recalc_intercepts(svm); } -static int direct_access_msr_slot(u32 msr) -{ - u32 i; - - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == msr) - return i; - - return -ENOENT; -} - -static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, - int write) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int slot = direct_access_msr_slot(msr); - - if (slot == -ENOENT) - return; - - /* Set the shadow bitmaps to the desired intercept states */ - if (read) - set_bit(slot, svm->shadow_msr_intercept.read); - else - clear_bit(slot, svm->shadow_msr_intercept.read); - - if (write) - set_bit(slot, svm->shadow_msr_intercept.write); - else - clear_bit(slot, svm->shadow_msr_intercept.write); -} - -static bool valid_msr_intercept(u32 index) -{ - return direct_access_msr_slot(index) != -ENOENT; -} - static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { - u8 bit_write; - unsigned long tmp; - u32 offset; - u32 *msrpm; - /* * For non-nested case: * If the L01 MSR bitmap does not intercept the MSR, then we need to @@ -817,90 +674,102 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: - to_svm(vcpu)->msrpm; - - offset = svm_msrpm_offset(msr); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - BUG_ON(offset == MSR_INVALID); + void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm : + to_svm(vcpu)->msrpm; - return test_bit(bit_write, &tmp); + return svm_test_msr_bitmap_write(msrpm, msr); } -static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, - u32 msr, int read, int write) +void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) { struct vcpu_svm *svm = to_svm(vcpu); - u8 bit_read, bit_write; - unsigned long tmp; - u32 offset; - - /* - * If this warning triggers extend the direct_access_msrs list at the - * beginning of the file - */ - WARN_ON(!valid_msr_intercept(msr)); + void *msrpm = svm->msrpm; - /* Enforce non allowed MSRs to trap */ - if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) - read = 0; - - if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) - write = 0; - - offset = svm_msrpm_offset(msr); - bit_read = 2 * (msr & 0x0f); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - BUG_ON(offset == MSR_INVALID); - - read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); - write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); + /* Don't disable interception for MSRs userspace wants to handle. */ + if (type & MSR_TYPE_R) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + svm_clear_msr_bitmap_read(msrpm, msr); + else + svm_set_msr_bitmap_read(msrpm, msr); + } - msrpm[offset] = tmp; + if (type & MSR_TYPE_W) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + svm_clear_msr_bitmap_write(msrpm, msr); + else + svm_set_msr_bitmap_write(msrpm, msr); + } svm_hv_vmcb_dirty_nested_enlightenments(vcpu); svm->nested.force_msr_bitmap_recalc = true; } -void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, - int read, int write) -{ - set_shadow_msr_intercept(vcpu, msr, read, write); - set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); -} - -u32 *svm_vcpu_alloc_msrpm(void) +void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) { - unsigned int order = get_order(MSRPM_SIZE); - struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order); - u32 *msrpm; + unsigned int order = get_order(size); + struct page *pages = alloc_pages(gfp_mask, order); + void *pm; if (!pages) return NULL; - msrpm = page_address(pages); - memset(msrpm, 0xff, PAGE_SIZE * (1 << order)); + /* + * Set all bits in the permissions map so that all MSR and I/O accesses + * are intercepted by default. + */ + pm = page_address(pages); + memset(pm, 0xff, PAGE_SIZE * (1 << order)); - return msrpm; + return pm; } -void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) { - int i; + bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - if (!direct_access_msrs[i].always) - continue; - set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); - } + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept); + + if (sev_es_guest(vcpu->kvm)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); } void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) { + static const u32 x2avic_passthrough_msrs[] = { + X2APIC_MSR(APIC_ID), + X2APIC_MSR(APIC_LVR), + X2APIC_MSR(APIC_TASKPRI), + X2APIC_MSR(APIC_ARBPRI), + X2APIC_MSR(APIC_PROCPRI), + X2APIC_MSR(APIC_EOI), + X2APIC_MSR(APIC_RRR), + X2APIC_MSR(APIC_LDR), + X2APIC_MSR(APIC_DFR), + X2APIC_MSR(APIC_SPIV), + X2APIC_MSR(APIC_ISR), + X2APIC_MSR(APIC_TMR), + X2APIC_MSR(APIC_IRR), + X2APIC_MSR(APIC_ESR), + X2APIC_MSR(APIC_ICR), + X2APIC_MSR(APIC_ICR2), + + /* + * Note! Always intercept LVTT, as TSC-deadline timer mode + * isn't virtualized by hardware, and the CPU will generate a + * #GP instead of a #VMEXIT. + */ + X2APIC_MSR(APIC_LVTTHMR), + X2APIC_MSR(APIC_LVTPC), + X2APIC_MSR(APIC_LVT0), + X2APIC_MSR(APIC_LVT1), + X2APIC_MSR(APIC_LVTERR), + X2APIC_MSR(APIC_TMICT), + X2APIC_MSR(APIC_TMCCT), + X2APIC_MSR(APIC_TDCR), + }; int i; if (intercept == svm->x2avic_msrs_intercepted) @@ -909,84 +778,79 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) if (!x2avic_enabled) return; - for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { - int index = direct_access_msrs[i].index; - - if ((index < APIC_BASE_MSR) || - (index > APIC_BASE_MSR + 0xff)) - continue; - set_msr_interception(&svm->vcpu, svm->msrpm, index, - !intercept, !intercept); - } + for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) + svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], + MSR_TYPE_RW, intercept); svm->x2avic_msrs_intercepted = intercept; } -void svm_vcpu_free_msrpm(u32 *msrpm) +void svm_vcpu_free_msrpm(void *msrpm) { __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); } -static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) +static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - u32 i; - - /* - * Set intercept permissions for all direct access MSRs again. They - * will automatically get filtered through the MSR filter, so we are - * back in sync after this. - */ - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 msr = direct_access_msrs[i].index; - u32 read = test_bit(i, svm->shadow_msr_intercept.read); - u32 write = test_bit(i, svm->shadow_msr_intercept.write); - - set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); - } -} -static void add_msr_offset(u32 offset) -{ - int i; + svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - for (i = 0; i < MSRPM_OFFSETS; ++i) { - - /* Offset already in list? */ - if (msrpm_offsets[i] == offset) - return; +#ifdef CONFIG_X86_64 + svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW); +#endif - /* Slot used by another offset? */ - if (msrpm_offsets[i] != MSR_INVALID) - continue; + if (lbrv) + svm_recalc_lbr_msr_intercepts(vcpu); - /* Add offset to list */ - msrpm_offsets[i] = offset; + if (cpu_feature_enabled(X86_FEATURE_IBPB)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, + !guest_has_pred_cmd_msr(vcpu)); - return; - } + if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); /* - * If this BUG triggers the msrpm_offsets table has an overflow. Just - * increase MSRPM_OFFSETS in this case. + * Disable interception of SPEC_CTRL if KVM doesn't need to manually + * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if + * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively + * using SPEC_CTRL. */ - BUG(); -} - -static void init_msrpm_offsets(void) -{ - int i; - - memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); + if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !guest_has_spec_ctrl_msr(vcpu)); + else + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !svm->spec_ctrl); - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 offset; + /* + * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU, + * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits. + */ + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW, + guest_cpuid_is_intel_compatible(vcpu)); + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW, + guest_cpuid_is_intel_compatible(vcpu)); + + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { + svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); + } - offset = svm_msrpm_offset(direct_access_msrs[i].index); - BUG_ON(offset == MSR_INVALID); + if (sev_es_guest(vcpu->kvm)) + sev_es_recalc_msr_intercepts(vcpu); - add_msr_offset(offset); - } + /* + * x2APIC intercepts are modified on-demand and cannot be filtered by + * userspace. + */ } void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) @@ -1005,13 +869,7 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); - - if (sev_es_guest(vcpu->kvm)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); + svm_recalc_lbr_msr_intercepts(vcpu); /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ if (is_guest_mode(vcpu)) @@ -1023,12 +881,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); - svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + svm_recalc_lbr_msr_intercepts(vcpu); /* * Move the LBR msrs back to the vmcb01 to avoid copying them @@ -1183,9 +1037,10 @@ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) } /* Evaluate instruction intercepts that depend on guest CPUID features. */ -static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, - struct vcpu_svm *svm) +static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + /* * Intercept INVPCID if shadow paging is enabled to sync/free shadow * roots, or if INVPCID is disabled in the guest to inject #UD. @@ -1204,24 +1059,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, else svm_set_intercept(svm, INTERCEPT_RDTSCP); } -} - -static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); if (guest_cpuid_is_intel_compatible(vcpu)) { - /* - * We must intercept SYSENTER_EIP and SYSENTER_ESP - * accesses because the processor only stores 32 bits. - * For the same reason we cannot use virtual VMLOAD/VMSAVE. - */ svm_set_intercept(svm, INTERCEPT_VMLOAD); svm_set_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -1232,12 +1074,15 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } - /* No need to intercept these MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); } } +static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + svm_recalc_instruction_intercepts(vcpu); + svm_recalc_msr_intercepts(vcpu); +} + static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1360,15 +1205,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_PAUSE); } - svm_recalc_instruction_intercepts(vcpu, svm); - - /* - * If the host supports V_SPEC_CTRL then disable the interception - * of MSR_IA32_SPEC_CTRL. - */ - if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); - if (kvm_vcpu_apicv_active(vcpu)) avic_init_vmcb(svm, vmcb); @@ -1381,11 +1217,15 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) sev_init_vmcb(svm); svm_hv_init_vmcb(vmcb); - init_vmcb_after_set_cpuid(vcpu); + + svm_recalc_intercepts_after_set_cpuid(vcpu); vmcb_mark_all_dirty(vmcb); @@ -1396,8 +1236,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm_init_osvw(vcpu); if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) @@ -1490,24 +1328,11 @@ out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) -{ - int i; - - for_each_online_cpu(i) - cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); -} - static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb. - */ - svm_clear_current_vmcb(svm->vmcb); + WARN_ON_ONCE(!list_empty(&svm->ir_list)); svm_leave_nested(vcpu); svm_free_nested(svm); @@ -1515,9 +1340,66 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) sev_free_vcpu(vcpu); __free_page(__sme_pa_to_page(svm->vmcb01.pa)); - __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); + svm_vcpu_free_msrpm(svm->msrpm); } +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) +{ + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + /* + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. + */ + if (atomic_read(&srso_nr_vms)) + return; + + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} + +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); +} +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1550,6 +1432,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -1560,19 +1447,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; - - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) && - static_branch_likely(&switch_vcpu_ibpb)) - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } @@ -2149,14 +2026,13 @@ static int ac_interception(struct kvm_vcpu *vcpu) static bool is_erratum_383(void) { - int err, i; + int i; u64 value; if (!erratum_383_found) return false; - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) + if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) return false; /* Bit 62 may or may not be set for this mce */ @@ -2167,17 +2043,11 @@ static bool is_erratum_383(void) /* Clear MCi_STATUS registers */ for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); + if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + native_write_msr_safe(MSR_IA32_MCG_STATUS, value); } /* Flush tlb to evict multi-match entries */ @@ -2231,6 +2101,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -2850,12 +2724,11 @@ static int svm_get_feature_msr(u32 msr, u64 *data) return 0; } -static bool -sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, + struct msr_data *msr_info) { return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected && - svm_msrpm_offset(msr_info->index) != MSR_INVALID && !msr_write_intercepted(vcpu, msr_info->index); } @@ -3086,11 +2959,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_svm_vmrun_msrpm. + * nested_svm_merge_msrpm(). * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -3156,8 +3029,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) /* * TSC_AUX is usually changed only during boot and never read - * directly. Intercept TSC_AUX instead of exposing it to the - * guest via direct_access_msrs, and switch it via user return. + * directly. Intercept TSC_AUX and switch it via user return. */ preempt_disable(); ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); @@ -3174,17 +3046,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } /* - * AMD changed the architectural behavior of bits 5:2. On CPUs - * without BusLockTrap, bits 5:2 control "external pins", but - * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap - * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed - * the guest to set bits 5:2 despite not actually virtualizing - * Performance-Monitoring/Breakpoint external pins. Drop bits - * 5:2 for backwards compatibility. - */ - data &= ~GENMASK(5, 2); - - /* * Suppress BTF as KVM doesn't virtualize BTF, but there's no * way to communicate lack of support to the guest. */ @@ -3314,6 +3175,37 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. + */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3383,6 +3275,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, @@ -3397,14 +3290,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + guard(mutex)(&vmcb_dump_mutex); + + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? "SEV" : "SVM"; + + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3442,6 +3342,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3512,6 +3423,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + "r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) @@ -3548,6 +3516,10 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } @@ -4259,9 +4231,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_exit_irqoff(); } -static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, - bool force_immediate_exit) +static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_svm *svm = to_svm(vcpu); bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); @@ -4308,10 +4280,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, svm_hv_update_vp_id(svm->vmcb, vcpu); /* - * Run with all-zero DR6 unless needed, so that we can get the exact cause - * of a #DB. + * Run with all-zero DR6 unless the guest can write DR6 freely, so that + * KVM can get the exact cause of a #DB. Note, loading guest DR6 from + * KVM's snapshot is only necessary when DR accesses won't exit. */ - if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) + svm_set_dr6(vcpu, vcpu->arch.dr6); + else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); @@ -4491,20 +4466,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (guest_cpuid_is_intel_compatible(vcpu)) guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - svm_recalc_instruction_intercepts(vcpu, svm); - - if (boot_cpu_has(X86_FEATURE_IBPB)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, - !!guest_has_pred_cmd_msr(vcpu)); - - if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, - !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); - if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); - init_vmcb_after_set_cpuid(vcpu); + svm_recalc_intercepts_after_set_cpuid(vcpu); } static bool svm_has_wbinvd_exit(void) @@ -5036,6 +5001,8 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) @@ -5053,7 +5020,7 @@ static int svm_vm_init(struct kvm *kvm) } if (!pause_filter_count || !pause_filter_thresh) - kvm->arch.pause_in_guest = true; + kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); if (enable_apicv) { int ret = avic_vm_init(kvm); @@ -5061,6 +5028,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } @@ -5119,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, @@ -5204,7 +5171,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .apic_init_signal_blocked = svm_apic_init_signal_blocked, - .msr_filter_changed = svm_msr_filter_changed, + .recalc_msr_intercepts = svm_recalc_msr_intercepts, .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, @@ -5232,7 +5199,7 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; @@ -5306,6 +5273,9 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) @@ -5337,11 +5307,8 @@ static __init void svm_set_cpu_caps(void) static __init int svm_hardware_setup(void) { - int cpu; - struct page *iopm_pages; void *iopm_va; - int r; - unsigned int order = get_order(IOPM_SIZE); + int cpu, r; /* * NX is required for shadow paging and for NPT if the NX huge pages @@ -5353,17 +5320,6 @@ static __init int svm_hardware_setup(void) } kvm_enable_efer_bits(EFER_NX); - iopm_pages = alloc_pages(GFP_KERNEL, order); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = __sme_page_pa(iopm_pages); - - init_msrpm_offsets(); - kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); @@ -5397,6 +5353,10 @@ static __init int svm_hardware_setup(void) if (nested) { pr_info("Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + + r = nested_svm_init_msrpm_merge_offsets(); + if (r) + return r; } /* @@ -5428,6 +5388,13 @@ static __init int svm_hardware_setup(void) else pr_info("LBR virtualization supported\n"); } + + iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL); + if (!iopm_va) + return -ENOMEM; + + iopm_base = __sme_set(__pa(iopm_va)); + /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which * may be modified by svm_adjust_mmio_mask()), as well as nrips. @@ -5445,6 +5412,7 @@ static __init int svm_hardware_setup(void) enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { + enable_ipiv = false; svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; @@ -5501,6 +5469,7 @@ static __init int svm_hardware_setup(void) */ allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; err: @@ -5525,6 +5494,8 @@ static int __init svm_init(void) { int r; + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm); + __unused_size_checks(); if (!kvm_is_svm_supported()) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d4490eaed55d..58b9d168e0c8 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -44,9 +44,6 @@ static inline struct page *__sme_pa_to_page(unsigned long pa) #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 48 -#define MSRPM_OFFSETS 32 -extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int nrips; extern int vgif; @@ -98,6 +95,7 @@ struct kvm_sev_info { unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long policy; unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ @@ -112,15 +110,19 @@ struct kvm_sev_info { void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */ void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ + cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ }; +#define SEV_POLICY_NODBG BIT_ULL(0) +#define SNP_POLICY_DEBUG BIT_ULL(19) + struct kvm_svm { struct kvm kvm; /* Struct members for AVIC */ u32 avic_vm_id; - struct page *avic_logical_id_table_page; - struct page *avic_physical_id_table_page; + u32 *avic_logical_id_table; + u64 *avic_physical_id_table; struct hlist_node hnode; struct kvm_sev_info sev_info; @@ -169,6 +171,7 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; + u64 bus_lock_rip; union { #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; @@ -184,8 +187,11 @@ struct svm_nested_state { u64 vmcb12_gpa; u64 last_vmcb12_gpa; - /* These are the merged vectors */ - u32 *msrpm; + /* + * The MSR permissions map used for vmcb02, which is the merge result + * of vmcb01 and vmcb12 + */ + void *msrpm; /* A VMRUN has started but has not yet been performed, so * we cannot inject a nested vmexit yet. */ @@ -266,7 +272,7 @@ struct vcpu_svm { */ u64 virt_spec_ctrl; - u32 *msrpm; + void *msrpm; ulong nmi_iret_rip; @@ -301,24 +307,26 @@ struct vcpu_svm { u32 ldr_reg; u32 dfr_reg; - struct page *avic_backing_page; - u64 *avic_physical_id_cache; + + /* This is essentially a shadow of the vCPU's actual entry in the + * Physical ID table that is programmed into the VMCB, i.e. that is + * seen by the CPU. If IPI virtualization is disabled, IsRunning is + * only ever set in the shadow, i.e. is never propagated to the "real" + * table, so that hardware never sees IsRunning=1. + */ + u64 avic_physical_id_entry; /* - * Per-vcpu list of struct amd_svm_iommu_ir: - * This is used mainly to store interrupt remapping information used - * when update the vcpu affinity. This avoids the need to scan for - * IRTE and try to match ga_tag in the IOMMU driver. + * Per-vCPU list of irqfds that are eligible to post IRQs directly to + * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list + * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the + * target pCPU), when AVIC is toggled on/off (to (de)activate bypass), + * and if the irqfd becomes ineligible for posting (to put the IRTE + * back into remapped mode). */ struct list_head ir_list; spinlock_t ir_list_lock; - /* Save desired MSR intercept (read: pass-through) state */ - struct { - DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS); - DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); - } shadow_msr_intercept; - struct vcpu_sev_es_state sev_es; bool guest_state_loaded; @@ -335,11 +343,11 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; + bool bp_spec_reduce_set; + struct vmcb *save_area; unsigned long save_area_pa; - struct vmcb *current_vmcb; - /* index = sev_asid, value = vmcb pointer */ struct vmcb **sev_vmcbs; }; @@ -608,17 +616,74 @@ static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data) svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); } -/* svm.c */ -#define MSR_INVALID 0xffffffffU +/* + * The MSRPM is 8KiB in size, divided into four 2KiB ranges (the fourth range + * is reserved). Each MSR within a range is covered by two bits, one each for + * read (bit 0) and write (bit 1), where a bit value of '1' means intercepted. + */ +#define SVM_MSRPM_BYTES_PER_RANGE 2048 +#define SVM_BITS_PER_MSR 2 +#define SVM_MSRS_PER_BYTE (BITS_PER_BYTE / SVM_BITS_PER_MSR) +#define SVM_MSRS_PER_RANGE (SVM_MSRPM_BYTES_PER_RANGE * SVM_MSRS_PER_BYTE) +static_assert(SVM_MSRS_PER_RANGE == 8192); +#define SVM_MSRPM_OFFSET_MASK (SVM_MSRS_PER_RANGE - 1) + +static __always_inline int svm_msrpm_bit_nr(u32 msr) +{ + int range_nr; + + switch (msr & ~SVM_MSRPM_OFFSET_MASK) { + case 0: + range_nr = 0; + break; + case 0xc0000000: + range_nr = 1; + break; + case 0xc0010000: + range_nr = 2; + break; + default: + return -EINVAL; + } + + return range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE + + (msr & SVM_MSRPM_OFFSET_MASK) * SVM_BITS_PER_MSR; +} + +#define __BUILD_SVM_MSR_BITMAP_HELPER(rtype, action, bitop, access, bit_rw) \ +static inline rtype svm_##action##_msr_bitmap_##access(unsigned long *bitmap, \ + u32 msr) \ +{ \ + int bit_nr; \ + \ + bit_nr = svm_msrpm_bit_nr(msr); \ + if (bit_nr < 0) \ + return (rtype)true; \ + \ + return bitop##_bit(bit_nr + bit_rw, bitmap); \ +} + +#define BUILD_SVM_MSR_BITMAP_HELPERS(ret_type, action, bitop) \ + __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0) \ + __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 1) + +BUILD_SVM_MSR_BITMAP_HELPERS(bool, test, test) +BUILD_SVM_MSR_BITMAP_HELPERS(void, clear, __clear) +BUILD_SVM_MSR_BITMAP_HELPERS(void, set, __set) #define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) +/* svm.c */ extern bool dump_invalid_vmcb; -u32 svm_msrpm_offset(u32 msr); -u32 *svm_vcpu_alloc_msrpm(void); -void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); -void svm_vcpu_free_msrpm(u32 *msrpm); +void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask); + +static inline void *svm_vcpu_alloc_msrpm(void) +{ + return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT); +} + +void svm_vcpu_free_msrpm(void *msrpm); void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); void svm_enable_lbrv(struct kvm_vcpu *vcpu); void svm_update_lbrv(struct kvm_vcpu *vcpu); @@ -638,6 +703,20 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); +void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); + +static inline void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type) +{ + svm_set_intercept_for_msr(vcpu, msr, type, false); +} + +static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type) +{ + svm_set_intercept_for_msr(vcpu, msr, type, true); +} + /* nested.c */ #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ @@ -666,6 +745,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } +int __init nested_svm_init_msrpm_merge_offsets(void); + int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); void svm_leave_nested(struct kvm_vcpu *vcpu); @@ -716,7 +797,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops; BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \ - BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \ + BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \ ) bool avic_hardware_setup(void); @@ -731,8 +813,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_vcpu *vcpu, u32 vector); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); @@ -747,6 +830,7 @@ void sev_init_vmcb(struct vcpu_svm *svm); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_vcpu_reset(struct vcpu_svm *svm); +void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); @@ -783,6 +867,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { @@ -814,6 +900,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return 0; } +static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + return NULL; +} +static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {} #endif /* vmenter.S */ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 0c61153b275f..235c4af6b692 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -169,6 +169,9 @@ SYM_FUNC_START(__svm_vcpu_run) #endif mov VCPU_RDI(%_ASM_DI), %_ASM_DI + /* Clobbers EFLAGS.ZF */ + VM_CLEAR_CPU_BUFFERS + /* Enter guest mode */ 3: vmrun %_ASM_AX 4: @@ -335,6 +338,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov SVM_current_vmcb(%rdi), %rax mov KVM_VMCB_pa(%rax), %rax + /* Clobbers EFLAGS.ZF */ + VM_CLEAR_CPU_BUFFERS + /* Enter guest mode */ 1: vmrun %rax 2: |