diff options
Diffstat (limited to 'arch/x86/kvm/svm/avic.c')
| -rw-r--r-- | arch/x86/kvm/svm/avic.c | 1265 |
1 files changed, 687 insertions, 578 deletions
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 6919dee69f18..6b77b2033208 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -12,14 +12,16 @@ * Avi Kivity <avi@qumranet.com> */ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/hashtable.h> #include <linux/amd-iommu.h> #include <linux/kvm_host.h> +#include <linux/kvm_irqfd.h> #include <asm/irq_remapping.h> +#include <asm/msr.h> #include "trace.h" #include "lapic.h" @@ -27,18 +29,68 @@ #include "irq.h" #include "svm.h" -/* AVIC GATAG is encoded using VM and VCPU IDs */ -#define AVIC_VCPU_ID_BITS 8 -#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) +/* + * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that + * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't + * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index + * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast + * lookup on the index, where as vCPUs whose index doesn't match their ID need + * to walk the entire xarray of vCPUs in the worst case scenario. + * + * For the vCPU index, use however many bits are currently allowed for the max + * guest physical APIC ID (limited by the size of the physical ID table), and + * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the + * size of the GATag is defined by hardware (32 bits), but is an opaque value + * as far as hardware is concerned. + */ +#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK + +#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) +#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) -#define AVIC_VM_ID_BITS 24 -#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) -#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) +#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) +#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK) -#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ - (y & AVIC_VCPU_ID_MASK)) -#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) -#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ + ((vcpu_idx) & AVIC_VCPU_IDX_MASK)) +#define AVIC_GATAG(vm_id, vcpu_idx) \ +({ \ + u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \ + \ + WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \ + WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ + ga_tag; \ +}) + +static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); + +#define AVIC_AUTO_MODE -1 + +static int avic_param_set(const char *val, const struct kernel_param *kp) +{ + if (val && sysfs_streq(val, "auto")) { + *(int *)kp->arg = AVIC_AUTO_MODE; + return 0; + } + + return param_set_bint(val, kp); +} + +static const struct kernel_param_ops avic_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = avic_param_set, + .get = param_get_bool, +}; + +/* + * Enable / disable AVIC. In "auto" mode (default behavior), AVIC is enabled + * for Zen4+ CPUs with x2AVIC (and all other criteria for enablement are met). + */ +static int avic = AVIC_AUTO_MODE; +module_param_cb(avic, &avic_ops, &avic, 0444); +__MODULE_PARM_TYPE(avic, "bool"); + +module_param(enable_ipiv, bool, 0444); static bool force_avic; module_param_unsafe(force_avic, bool, 0444); @@ -53,43 +105,117 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); -enum avic_modes avic_mode; +static bool x2avic_enabled; +static u32 x2avic_max_physical_id; -/* - * This is a wrapper of struct amd_iommu_ir_data. - */ -struct amd_svm_iommu_ir { - struct list_head node; /* Used by SVM for per-vcpu ir_list */ - void *data; /* Storing pointer to struct amd_ir_data */ -}; +static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, + bool intercept) +{ + static const u32 x2avic_passthrough_msrs[] = { + X2APIC_MSR(APIC_ID), + X2APIC_MSR(APIC_LVR), + X2APIC_MSR(APIC_TASKPRI), + X2APIC_MSR(APIC_ARBPRI), + X2APIC_MSR(APIC_PROCPRI), + X2APIC_MSR(APIC_EOI), + X2APIC_MSR(APIC_RRR), + X2APIC_MSR(APIC_LDR), + X2APIC_MSR(APIC_DFR), + X2APIC_MSR(APIC_SPIV), + X2APIC_MSR(APIC_ISR), + X2APIC_MSR(APIC_TMR), + X2APIC_MSR(APIC_IRR), + X2APIC_MSR(APIC_ESR), + X2APIC_MSR(APIC_ICR), + X2APIC_MSR(APIC_ICR2), + + /* + * Note! Always intercept LVTT, as TSC-deadline timer mode + * isn't virtualized by hardware, and the CPU will generate a + * #GP instead of a #VMEXIT. + */ + X2APIC_MSR(APIC_LVTTHMR), + X2APIC_MSR(APIC_LVTPC), + X2APIC_MSR(APIC_LVT0), + X2APIC_MSR(APIC_LVT1), + X2APIC_MSR(APIC_LVTERR), + X2APIC_MSR(APIC_TMICT), + X2APIC_MSR(APIC_TMCCT), + X2APIC_MSR(APIC_TDCR), + }; + int i; + + if (intercept == svm->x2avic_msrs_intercepted) + return; + + if (!x2avic_enabled) + return; + + for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) + svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], + MSR_TYPE_RW, intercept); + + svm->x2avic_msrs_intercepted = intercept; +} + +static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu) +{ + u32 arch_max; + + /* + * Return the largest size (x2APIC) when querying without a vCPU, e.g. + * to allocate the per-VM table.. + */ + if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic))) + arch_max = x2avic_max_physical_id; + else + arch_max = AVIC_MAX_PHYSICAL_ID; + + /* + * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID + * plus one, so the max possible APIC ID is one less than that. + */ + return min(kvm->arch.max_vcpu_ids - 1, arch_max); +} + +static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) +{ + return __avic_get_max_physical_id(vcpu->kvm, vcpu); +} static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; + struct kvm_vcpu *vcpu = &svm->vcpu; vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu); vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - /* Note: - * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC - * MSR accesses, while interrupt injection to a running vCPU - * can be achieved using AVIC doorbell. The AVIC hardware still - * accelerate MMIO accesses, but this does not cause any harm - * as the guest is not supposed to access xAPIC mmio when uses x2APIC. + /* + * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR + * accesses, while interrupt injection to a running vCPU can be + * achieved using AVIC doorbell. KVM disables the APIC access page + * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling + * AVIC in hybrid mode activates only the doorbell mechanism. */ - if (apic_x2apic_mode(svm->vcpu.arch.apic) && - avic_mode == AVIC_MODE_X2) { + if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { vmcb->control.int_ctl |= X2APIC_MODE_MASK; - vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + /* Disabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, false); + avic_set_x2apic_msr_interception(svm, false); } else { - /* For xAVIC and hybrid-xAVIC modes */ - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; + /* + * Flush the TLB, the guest may have inserted a non-APIC + * mapping into the TLB while AVIC was disabled. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); + /* Enabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, true); + avic_set_x2apic_msr_interception(svm, true); } } @@ -109,29 +235,29 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm) return; /* Enabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, true); + avic_set_x2apic_msr_interception(svm, true); } /* Note: * This function is called from IOMMU driver to notify * SVM to schedule in a particular vCPU of a particular VM. */ -int avic_ga_log_notifier(u32 ga_tag) +static int avic_ga_log_notifier(u32 ga_tag) { unsigned long flags; struct kvm_svm *kvm_svm; struct kvm_vcpu *vcpu = NULL; u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); + u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag); - pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); - trace_kvm_avic_ga_log(vm_id, vcpu_id); + pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx); + trace_kvm_avic_ga_log(vm_id, vcpu_idx); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { if (kvm_svm->avic_vm_id != vm_id) continue; - vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); + vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx); break; } spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); @@ -147,6 +273,30 @@ int avic_ga_log_notifier(u32 ga_tag) return 0; } +static int avic_get_physical_id_table_order(struct kvm *kvm) +{ + /* Provision for the maximum physical ID supported in x2avic mode */ + return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64)); +} + +int avic_alloc_physical_id_table(struct kvm *kvm) +{ + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_apicv) + return 0; + + if (kvm_svm->avic_physical_id_table) + return 0; + + kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, + avic_get_physical_id_table_order(kvm)); + if (!kvm_svm->avic_physical_id_table) + return -ENOMEM; + + return 0; +} + void avic_vm_destroy(struct kvm *kvm) { unsigned long flags; @@ -155,10 +305,9 @@ void avic_vm_destroy(struct kvm *kvm) if (!enable_apicv) return; - if (kvm_svm->avic_logical_id_table_page) - __free_page(kvm_svm->avic_logical_id_table_page); - if (kvm_svm->avic_physical_id_table_page) - __free_page(kvm_svm->avic_physical_id_table_page); + free_page((unsigned long)kvm_svm->avic_logical_id_table); + free_pages((unsigned long)kvm_svm->avic_physical_id_table, + avic_get_physical_id_table_order(kvm)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_del(&kvm_svm->hnode); @@ -171,27 +320,15 @@ int avic_vm_init(struct kvm *kvm) int err = -ENOMEM; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); struct kvm_svm *k2; - struct page *p_page; - struct page *l_page; u32 vm_id; if (!enable_apicv) return 0; - /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!p_page) - goto free_avic; - - kvm_svm->avic_physical_id_table_page = p_page; - - /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!l_page) + kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!kvm_svm->avic_logical_id_table) goto free_avic; - kvm_svm->avic_logical_id_table_page = l_page; - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); again: vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; @@ -217,17 +354,19 @@ free_avic: return err; } +static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm) +{ + return __sme_set(__pa(svm->vcpu.arch.apic->regs)); +} + void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) { struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); - phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); - phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); - phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); - vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; - vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; + vmcb->control.avic_backing_page = avic_get_backing_page_address(svm); + vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table)); + vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table)); + vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE; if (kvm_apicv_activated(svm->vcpu.kvm)) avic_activate_vmcb(svm); @@ -235,88 +374,62 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) avic_deactivate_vmcb(svm); } -static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, - unsigned int index) +static int avic_init_backing_page(struct kvm_vcpu *vcpu) { - u64 *avic_physical_id_table; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); + struct vcpu_svm *svm = to_svm(vcpu); + u32 id = vcpu->vcpu_id; + u64 new_entry; - if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) || - (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID)) - return NULL; - - avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); - - return &avic_physical_id_table[index]; -} - -/* - * Note: - * AVIC hardware walks the nested page table to check permissions, - * but does not use the SPA address specified in the leaf page - * table entry since it uses address in the AVIC_BACKING_PAGE pointer - * field of the VMCB. Therefore, we set up the - * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. - */ -static int avic_alloc_access_page(struct kvm *kvm) -{ - void __user *ret; - int r = 0; - - mutex_lock(&kvm->slots_lock); - - if (kvm->arch.apic_access_memslot_enabled) - goto out; - - ret = __x86_set_memory_region(kvm, - APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, - PAGE_SIZE); - if (IS_ERR(ret)) { - r = PTR_ERR(ret); - goto out; + /* + * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC + * hardware. Immediately clear apicv_active, i.e. don't wait until the + * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as + * avic_vcpu_load() expects to be called if and only if the vCPU has + * fully initialized AVIC. + */ + if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || + (id > x2avic_max_physical_id)) { + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); + vcpu->arch.apic->apicv_active = false; + return 0; } - kvm->arch.apic_access_memslot_enabled = true; -out: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static int avic_init_backing_page(struct kvm_vcpu *vcpu) -{ - u64 *entry, new_entry; - int id = vcpu->vcpu_id; - struct vcpu_svm *svm = to_svm(vcpu); - - if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) || - (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID)) - return -EINVAL; + BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE || + (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE); - if (!vcpu->arch.apic->regs) + if (WARN_ON_ONCE(!vcpu->arch.apic->regs)) return -EINVAL; if (kvm_apicv_activated(vcpu->kvm)) { int ret; - ret = avic_alloc_access_page(vcpu->kvm); + /* + * Note, AVIC hardware walks the nested page table to check + * permissions, but does not use the SPA address specified in + * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE + * pointer field of the VMCB. + */ + ret = kvm_alloc_apic_access_page(vcpu->kvm); if (ret) return ret; } - svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs); + /* Note, fls64() returns the bit position, +1. */ + BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT > + fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK)); /* Setting AVIC backing page address in the phy APIC ID table */ - entry = avic_get_physical_id_entry(vcpu, id); - if (!entry) - return -EINVAL; - - new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); - WRITE_ONCE(*entry, new_entry); + new_entry = avic_get_backing_page_address(svm) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + svm->avic_physical_id_entry = new_entry; - svm->avic_physical_id_cache = entry; + /* + * Initialize the real table, as vCPUs must have a valid entry in order + * for broadcast IPIs to function correctly (broadcast IPIs ignore + * invalid entries, i.e. aren't guaranteed to generate a VM-Exit). + */ + WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry); return 0; } @@ -333,12 +446,66 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) int cpu = READ_ONCE(vcpu->cpu); if (cpu != get_cpu()) { - wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); } put_cpu(); } + +static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl) +{ + vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); +} + +static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id, + u32 icrl) +{ + /* + * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID, + * i.e. APIC ID == vCPU ID. + */ + struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id); + + /* Once again, nothing to do if the target vCPU doesn't exist. */ + if (unlikely(!target_vcpu)) + return; + + avic_kick_vcpu(target_vcpu, icrl); +} + +static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table, + u32 logid_index, u32 icrl) +{ + u32 physical_id; + + if (avic_logical_id_table) { + u32 logid_entry = avic_logical_id_table[logid_index]; + + /* Nothing to do if the logical destination is invalid. */ + if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) + return; + + physical_id = logid_entry & + AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + } else { + /* + * For x2APIC, the logical APIC ID is a read-only value that is + * derived from the x2APIC ID, thus the x2APIC ID can be found + * by reversing the calculation (stored in logid_index). Note, + * bits 31:20 of the x2APIC ID aren't propagated to the logical + * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS. + */ + physical_id = logid_index; + } + + avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl); +} + /* * A fast-path version of avic_kick_target_vcpus(), which attempts to match * destination APIC ID to vCPU without looping through all vCPUs. @@ -346,11 +513,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh, u32 index) { - u32 l1_physical_id, dest; - struct kvm_vcpu *target_vcpu; int dest_mode = icrl & APIC_DEST_MASK; int shorthand = icrl & APIC_SHORT_MASK; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + u32 dest; if (shorthand != APIC_DEST_NOSHORT) return -EINVAL; @@ -367,18 +533,18 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST) return -EINVAL; - l1_physical_id = dest; - - if (WARN_ON_ONCE(l1_physical_id != index)) + if (WARN_ON_ONCE(dest != index)) return -EINVAL; + avic_kick_vcpu_by_physical_id(kvm, dest, icrl); } else { - u32 bitmap, cluster; - int logid_index; + u32 *avic_logical_id_table; + unsigned long bitmap, i; + u32 cluster; if (apic_x2apic_mode(source)) { /* 16 bit dest mask, 16 bit cluster id */ - bitmap = dest & 0xFFFF0000; + bitmap = dest & 0xFFFF; cluster = (dest >> 16) << 4; } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) { /* 8 bit dest mask*/ @@ -390,67 +556,32 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source cluster = (dest >> 4) << 2; } + /* Nothing to do if there are no destinations in the cluster. */ if (unlikely(!bitmap)) - /* guest bug: nobody to send the logical interrupt to */ return 0; - if (!is_power_of_2(bitmap)) - /* multiple logical destinations, use slow path */ - return -EINVAL; - - logid_index = cluster + __ffs(bitmap); - - if (!apic_x2apic_mode(source)) { - u32 *avic_logical_id_table = - page_address(kvm_svm->avic_logical_id_table_page); - - u32 logid_entry = avic_logical_id_table[logid_index]; - - if (WARN_ON_ONCE(index != logid_index)) - return -EINVAL; - - /* guest bug: non existing/reserved logical destination */ - if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) - return 0; + if (apic_x2apic_mode(source)) + avic_logical_id_table = NULL; + else + avic_logical_id_table = kvm_svm->avic_logical_id_table; - l1_physical_id = logid_entry & - AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; - } else { - /* - * For x2APIC logical mode, cannot leverage the index. - * Instead, calculate physical ID from logical ID in ICRH. - */ - int cluster = (icrh & 0xffff0000) >> 16; - int apic = ffs(icrh & 0xffff) - 1; - - /* - * If the x2APIC logical ID sub-field (i.e. icrh[15:0]) - * contains anything but a single bit, we cannot use the - * fast path, because it is limited to a single vCPU. - */ - if (apic < 0 || icrh != (1 << apic)) - return -EINVAL; - - l1_physical_id = (cluster << 4) + apic; - } + /* + * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical + * IDs, thus each bit in the destination is guaranteed to map + * to at most one vCPU. + */ + for_each_set_bit(i, &bitmap, 16) + avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table, + cluster + i, icrl); } - target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id); - if (unlikely(!target_vcpu)) - /* guest bug: non existing vCPU is a target of this IPI*/ - return 0; - - target_vcpu->arch.apic->irr_pending = true; - svm_complete_interrupt_delivery(target_vcpu, - icrl & APIC_MODE_MASK, - icrl & APIC_INT_LEVELTRIG, - icrl & APIC_VECTOR_MASK); return 0; } static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh, u32 index) { + u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh); unsigned long i; struct kvm_vcpu *vcpu; @@ -466,21 +597,9 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, * since entered the guest will have processed pending IRQs at VMRUN. */ kvm_for_each_vcpu(i, vcpu, kvm) { - u32 dest; - - if (apic_x2apic_mode(vcpu->arch.apic)) - dest = icrh; - else - dest = GET_XAPIC_DEST_FIELD(icrh); - if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, - dest, icrl & APIC_DEST_MASK)) { - vcpu->arch.apic->irr_pending = true; - svm_complete_interrupt_delivery(vcpu, - icrl & APIC_MODE_MASK, - icrl & APIC_INT_LEVELTRIG, - icrl & APIC_VECTOR_MASK); - } + dest, icrl & APIC_DEST_MASK)) + avic_kick_vcpu(vcpu, icrl); } } @@ -490,20 +609,24 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 & 0x1FF; + u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK; struct kvm_lapic *apic = vcpu->arch.apic; trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); switch (id) { + case AVIC_IPI_FAILURE_INVALID_TARGET: case AVIC_IPI_FAILURE_INVALID_INT_TYPE: /* * Emulate IPIs that are not handled by AVIC hardware, which - * only virtualizes Fixed, Edge-Triggered INTRs. The exit is - * a trap, e.g. ICR holds the correct value and RIP has been - * advanced, KVM is responsible only for emulating the IPI. - * Sadly, hardware may sometimes leave the BUSY flag set, in - * which case KVM needs to emulate the ICR write as well in + * only virtualizes Fixed, Edge-Triggered INTRs, and falls over + * if _any_ targets are invalid, e.g. if the logical mode mask + * is a superset of running vCPUs. + * + * The exit is a trap, e.g. ICR holds the correct value and RIP + * has been advanced, KVM is responsible only for emulating the + * IPI. Sadly, hardware may sometimes leave the BUSY flag set, + * in which case KVM needs to emulate the ICR write as well in * order to clear the BUSY flag. */ if (icrl & APIC_ICR_BUSY) @@ -519,13 +642,14 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) */ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index); break; - case AVIC_IPI_FAILURE_INVALID_TARGET: - break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); break; + case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR: + /* Invalid IPI with vector < 16 */ + break; default: - pr_err("Unknown IPI interception\n"); + vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n"); } return 1; @@ -541,33 +665,30 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - int index; - u32 *logical_apic_id_table; - int dlid = GET_APIC_LOGICAL_ID(ldr); - - if (!dlid) - return NULL; + u32 cluster, index; - if (flat) { /* flat */ - index = ffs(dlid) - 1; - if (index > 7) - return NULL; - } else { /* cluster */ - int cluster = (dlid & 0xf0) >> 4; - int apic = ffs(dlid & 0x0f) - 1; + ldr = GET_APIC_LOGICAL_ID(ldr); - if ((apic < 0) || (apic > 7) || - (cluster >= 0xf)) + if (flat) { + cluster = 0; + } else { + cluster = (ldr >> 4); + if (cluster >= 0xf) return NULL; - index = (cluster << 2) + apic; + ldr &= 0xf; } + if (!ldr || !is_power_of_2(ldr)) + return NULL; - logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); + index = __ffs(ldr); + if (WARN_ON_ONCE(index > 7)) + return NULL; + index += (cluster << 2); - return &logical_apic_id_table[index]; + return &kvm_svm->avic_logical_id_table[index]; } -static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) +static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) { bool flat; u32 *entry, new_entry; @@ -575,15 +696,13 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; entry = avic_get_logical_id_entry(vcpu, ldr, flat); if (!entry) - return -EINVAL; + return; new_entry = READ_ONCE(*entry); new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(*entry, new_entry); - - return 0; } static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) @@ -601,29 +720,23 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } -static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) +static void avic_handle_ldr_update(struct kvm_vcpu *vcpu) { - int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); u32 id = kvm_xapic_id(vcpu->arch.apic); /* AVIC does not support LDR update for x2APIC */ if (apic_x2apic_mode(vcpu->arch.apic)) - return 0; + return; if (ldr == svm->ldr_reg) - return 0; + return; avic_invalidate_logical_id_entry(vcpu); - if (ldr) - ret = avic_ldr_write(vcpu, id, ldr); - - if (!ret) - svm->ldr_reg = ldr; - - return ret; + svm->ldr_reg = ldr; + avic_ldr_write(vcpu, id, ldr); } static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) @@ -645,12 +758,14 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) switch (offset) { case APIC_LDR: - if (avic_handle_ldr_update(vcpu)) - return 0; + avic_handle_ldr_update(vcpu); break; case APIC_DFR: avic_handle_dfr_update(vcpu); break; + case APIC_RRR: + /* Ignore writes to Read Remote Data, it's read-only. */ + return 1; default: break; } @@ -719,6 +834,9 @@ int avic_init_vcpu(struct vcpu_svm *svm) int ret; struct kvm_vcpu *vcpu = &svm->vcpu; + INIT_LIST_HEAD(&svm->ir_list); + raw_spin_lock_init(&svm->ir_list_lock); + if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) return 0; @@ -726,8 +844,6 @@ int avic_init_vcpu(struct vcpu_svm *svm) if (ret) return ret; - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); svm->dfr_reg = APIC_DFR_FLAT; return ret; @@ -739,320 +855,201 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) { - if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) - return; + struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu; + unsigned long flags; - if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) { - WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id); + if (!vcpu) return; - } - avic_refresh_apicv_exec_ctrl(vcpu); + + raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); + list_del(&irqfd->vcpu_list); + raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); } -static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_vcpu *vcpu, u32 vector) { - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - struct vcpu_svm *svm = to_svm(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. + * If the IRQ was affined to a different vCPU, remove the IRTE metadata + * from the *previous* vCPU's list. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); + svm_ir_list_del(irqfd); - if (list_empty(&svm->ir_list)) - goto out; + if (vcpu) { + /* + * Try to enable guest_mode in IRTE, unless AVIC is inhibited, + * in which case configure the IRTE for legacy mode, but track + * the IRTE metadata so that it can be converted to guest mode + * if AVIC is enabled/uninhibited in the future. + */ + struct amd_iommu_pi_data pi_data = { + .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, + vcpu->vcpu_idx), + .is_guest_mode = kvm_vcpu_apicv_active(vcpu), + .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), + .vector = vector, + }; + struct vcpu_svm *svm = to_svm(vcpu); + u64 entry; + int ret; - list_for_each_entry(ir, &svm->ir_list, node) { - if (activate) - ret = amd_iommu_activate_guest_mode(ir->data); - else - ret = amd_iommu_deactivate_guest_mode(ir->data); + /* + * Prevent the vCPU from being scheduled out or migrated until + * the IRTE is updated and its metadata has been added to the + * list of IRQs being posted to the vCPU, to ensure the IRTE + * isn't programmed with stale pCPU/IsRunning information. + */ + guard(raw_spinlock_irqsave)(&svm->ir_list_lock); + + /* + * Update the target pCPU for IOMMU doorbells if the vCPU is + * running. If the vCPU is NOT running, i.e. is blocking or + * scheduled out, KVM will update the pCPU info when the vCPU + * is awakened and/or scheduled in. See also avic_vcpu_load(). + */ + entry = svm->avic_physical_id_entry; + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) { + pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + } else { + pi_data.cpu = -1; + pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; + } + + ret = irq_set_vcpu_affinity(host_irq, &pi_data); if (ret) - break; - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; -} + return ret; -static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - unsigned long flags; - struct amd_svm_iommu_ir *cur; + /* + * Revert to legacy mode if the IOMMU didn't provide metadata + * for the IRTE, which KVM needs to keep the IRTE up-to-date, + * e.g. if the vCPU is migrated or AVIC is disabled. + */ + if (WARN_ON_ONCE(!pi_data.ir_data)) { + irq_set_vcpu_affinity(host_irq, NULL); + return -EIO; + } - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, node) { - if (cur->data != pi->ir_data) - continue; - list_del(&cur->node); - kfree(cur); - break; + irqfd->irq_bypass_data = pi_data.ir_data; + list_add(&irqfd->vcpu_list, &svm->ir_list); + return 0; } - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + return irq_set_vcpu_affinity(host_irq, NULL); } -static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - - /** - * In some cases, the existing irte is updated and re-set, - * so we need to check here if it's already been * added - * to the ir_list. +enum avic_vcpu_action { + /* + * There is no need to differentiate between activate and deactivate, + * as KVM only refreshes AVIC state when the vCPU is scheduled in and + * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is + * being (de)activated. */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { - struct kvm *kvm = svm->vcpu.kvm; - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); - struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - struct vcpu_svm *prev_svm; - - if (!prev_vcpu) { - ret = -EINVAL; - goto out; - } - - prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, pi); - } + AVIC_TOGGLE_ON_OFF = BIT(0), + AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF, + AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF, - /** - * Allocating new amd_iommu_pi_data, which will get - * add to the per-vcpu ir_list. + /* + * No unique action is required to deal with a vCPU that stops/starts + * running. A vCPU that starts running by definition stops blocking as + * well, and a vCPU that stops running can't have been blocking, i.e. + * doesn't need to toggle GALogIntr. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); - if (!ir) { - ret = -ENOMEM; - goto out; - } - ir->data = pi->ir_data; + AVIC_START_RUNNING = 0, + AVIC_STOP_RUNNING = 0, - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_add(&ir->node, &svm->ir_list); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -out: - return ret; -} + /* + * When a vCPU starts blocking, KVM needs to set the GALogIntr flag + * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is + * sent to the vCPU. + */ + AVIC_START_BLOCKING = BIT(1), +}; -/* - * Note: - * The HW cannot support posting multicast/broadcast - * interrupts to a vCPU. So, we still use legacy interrupt - * remapping for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - */ -static int -get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct vcpu_data *vcpu_info, struct vcpu_svm **svm) +static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, + enum avic_vcpu_action action) { - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu = NULL; + bool ga_log_intr = (action & AVIC_START_BLOCKING); + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_kernel_irqfd *irqfd; - kvm_set_msi_irq(kvm, e, &irq); + lockdep_assert_held(&svm->ir_list_lock); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", - __func__, irq.vector); - return -1; - } + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entry targeting this vcpu. + */ + if (list_empty(&svm->ir_list)) + return; - pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, - irq.vector); - *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); - vcpu_info->vector = irq.vector; + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { + void *data = irqfd->irq_bypass_data; - return 0; + if (!(action & AVIC_TOGGLE_ON_OFF)) + WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr)); + else if (cpu >= 0) + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr)); + else + WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); + } } -/* - * avic_pi_update_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, + enum avic_vcpu_action action) { - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - int idx, ret = 0; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) - return 0; - - pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", - __func__, host_irq, guest_irq, set); - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - struct vcpu_data vcpu_info; - struct vcpu_svm *svm = NULL; - - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - - /** - * Here, we setup with legacy mode in the following cases: - * 1. When cannot target interrupt to a specific vcpu. - * 2. Unsetting posted interrupt. - * 3. APIC virtualization is disabled for the vcpu. - * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) - */ - if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && - kvm_vcpu_apicv_active(&svm->vcpu)) { - struct amd_iommu_pi_data pi; - - /* Try to enable guest_mode in IRTE */ - pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & - AVIC_HPA_MASK); - pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, - svm->vcpu.vcpu_id); - pi.is_guest_mode = true; - pi.vcpu_data = &vcpu_info; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Here, we successfully setting up vcpu affinity in - * IOMMU guest mode. Now, we need to store the posted - * interrupt information in a per-vcpu ir_list so that - * we can reference to them directly when we update vcpu - * scheduling information in IOMMU irte. - */ - if (!ret && pi.is_guest_mode) - svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } - } + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); + int h_physical_id = kvm_cpu_get_apicid(cpu); + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long flags; + u64 entry; - if (!ret && svm) { - trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, - e->gsi, vcpu_info.vector, - vcpu_info.pi_desc_addr, set); - } + lockdep_assert_preemption_disabled(); - if (ret < 0) { - pr_err("%s: failed to update PI IRTE\n", __func__); - goto out; - } - } + if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) + return; - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= + PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) + return; -bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) -{ - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_ABSENT) | - BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_NESTED) | - BIT(APICV_INHIBIT_REASON_IRQWIN) | - BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | - BIT(APICV_INHIBIT_REASON_SEV) | - BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); - - return supported & BIT(reason); -} + /* + * Grab the per-vCPU interrupt remapping lock even if the VM doesn't + * _currently_ have assigned devices, as that can change. Holding + * ir_list_lock ensures that either svm_ir_list_add() will consume + * up-to-date entry information, or that this task will wait until + * svm_ir_list_add() completes to set the new target pCPU. + */ + raw_spin_lock_irqsave(&svm->ir_list_lock, flags); + entry = svm->avic_physical_id_entry; + WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); -static inline int -avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - struct vcpu_svm *svm = to_svm(vcpu); + entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK | + AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); + entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; + svm->avic_physical_id_entry = entry; /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. + * If IPI virtualization is disabled, clear IsRunning when updating the + * actual Physical ID table, so that the CPU never sees IsRunning=1. + * Keep the APIC ID up-to-date in the entry to minimize the chances of + * things going sideways if hardware peeks at the ID. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); + if (!enable_ipiv) + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (list_empty(&svm->ir_list)) - goto out; + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); - list_for_each_entry(ir, &svm->ir_list, node) { - ret = amd_iommu_update_ga(cpu, r, ir->data); - if (ret) - break; - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action); + + raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - u64 entry; - int h_physical_id = kvm_cpu_get_apicid(cpu); - struct vcpu_svm *svm = to_svm(vcpu); - - lockdep_assert_preemption_disabled(); - - if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) - return; - /* * No need to update anything if the vCPU is blocking, i.e. if the vCPU * is being scheduled in after being preempted. The CPU entries in the @@ -1063,46 +1060,99 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_vcpu_is_blocking(vcpu)) return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - - entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; - entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); + __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING); } -void avic_vcpu_put(struct kvm_vcpu *vcpu) +static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) { - u64 entry; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); + unsigned long flags; + u64 entry = svm->avic_physical_id_entry; lockdep_assert_preemption_disabled(); - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - - /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ - if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= + PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) return; - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + /* + * Take and hold the per-vCPU interrupt remapping lock while updating + * the Physical ID entry even though the lock doesn't protect against + * multiple writers (see above). Holding ir_list_lock ensures that + * either svm_ir_list_add() will consume up-to-date entry information, + * or that this task will wait until svm_ir_list_add() completes to + * mark the vCPU as not running. + */ + raw_spin_lock_irqsave(&svm->ir_list_lock, flags); + + avic_update_iommu_vcpu_affinity(vcpu, -1, action); + WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); + + /* + * Keep the previous APIC ID in the entry so that a rogue doorbell from + * hardware is at least restricted to a CPU associated with the vCPU. + */ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + + if (enable_ipiv) + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); + + /* + * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as + * it's a synthetic flag that usurps an unused should-be-zero bit. + */ + if (action & AVIC_START_BLOCKING) + entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; + + svm->avic_physical_id_entry = entry; + + raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); } +void avic_vcpu_put(struct kvm_vcpu *vcpu) +{ + /* + * Note, reading the Physical ID entry outside of ir_list_lock is safe + * as only the pCPU that has loaded (or is loading) the vCPU is allowed + * to modify the entry, and preemption is disabled. I.e. the vCPU + * can't be scheduled out and thus avic_vcpu_{put,load}() can't run + * recursively. + */ + u64 entry = to_svm(vcpu)->avic_physical_id_entry; + + /* + * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the + * vCPU is preempted while its in the process of blocking. WARN if the + * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put + * the AVIC if it wasn't previously loaded. + */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) { + if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu))) + return; -void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) + /* + * The vCPU was preempted while blocking, ensure its IRTEs are + * configured to generate GA Log Interrupts. + */ + if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR)))) + return; + } + + __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING : + AVIC_STOP_RUNNING); +} + +void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; - bool activated = kvm_vcpu_apicv_active(vcpu); - if (!enable_apicv) + if (!lapic_in_kernel(vcpu) || !enable_apicv) return; - if (activated) { + if (kvm_vcpu_apicv_active(vcpu)) { /** * During AVIC temporary deactivation, guest could update * APIC ID, DFR and LDR registers, which would not be trapped @@ -1116,13 +1166,22 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) avic_deactivate_vmcb(svm); } vmcb_mark_dirty(vmcb, VMCB_AVIC); +} - if (activated) - avic_vcpu_load(vcpu, vcpu->cpu); - else - avic_vcpu_put(vcpu); +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + if (!enable_apicv) + return; - avic_set_pi_irte_mode(vcpu, activated); + /* APICv should only be toggled on/off while the vCPU is running. */ + WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu)); + + avic_refresh_virtual_apic_mode(vcpu); + + if (kvm_vcpu_apicv_active(vcpu)) + __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE); + else + __avic_vcpu_put(vcpu, AVIC_DEACTIVATE); } void avic_vcpu_blocking(struct kvm_vcpu *vcpu) @@ -1130,20 +1189,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu) if (!kvm_vcpu_apicv_active(vcpu)) return; - /* - * Unload the AVIC when the vCPU is about to block, _before_ - * the vCPU actually blocks. - * - * Any IRQs that arrive before IsRunning=0 will not cause an - * incomplete IPI vmexit on the source, therefore vIRR will also - * be checked by kvm_vcpu_check_block() before blocking. The - * memory barrier implicit in set_current_state orders writing - * IsRunning=0 before reading the vIRR. The processor needs a - * matching memory barrier on interrupt delivery between writing - * IRR and reading IsRunning; the lack of this barrier might be - * the cause of errata #1235). - */ - avic_vcpu_put(vcpu); + /* + * Unload the AVIC when the vCPU is about to block, _before_ the vCPU + * actually blocks. + * + * Note, any IRQs that arrive before IsRunning=0 will not cause an + * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles + * this by checking vIRR one last time before blocking. The memory + * barrier implicit in set_current_state orders writing IsRunning=0 + * before reading the vIRR. The processor needs a matching memory + * barrier on interrupt delivery between writing IRR and reading + * IsRunning; the lack of this barrier might be the cause of errata #1235). + * + * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM + * doesn't need to detect events for scheduling purposes. The doorbell + * used to signal running vCPUs cannot be blocked, i.e. will perturb the + * CPU and cause noisy neighbor problems if the VM is sending interrupts + * to the vCPU while it's scheduled out. + */ + __avic_vcpu_put(vcpu, AVIC_START_BLOCKING); } void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) @@ -1154,43 +1218,88 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_vcpu_load(vcpu, vcpu->cpu); } +static bool __init avic_want_avic_enabled(void) +{ + /* + * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is + * supported (to avoid enabling partial support by default, and because + * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for + * family 0x19 and later (Zen5+), as the kernel's synthetic ZenX flags + * aren't inclusive of previous generations, i.e. the kernel will set + * at most one ZenX feature flag. + */ + if (avic == AVIC_AUTO_MODE) + avic = boot_cpu_has(X86_FEATURE_X2AVIC) && + (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4)); + + if (!avic || !npt_enabled) + return false; + + /* AVIC is a prerequisite for x2AVIC. */ + if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) { + if (boot_cpu_has(X86_FEATURE_X2AVIC)) + pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n"); + return false; + } + + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) && + !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) { + pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n"); + return false; + } + + /* + * Print a scary message if AVIC is force enabled to make it abundantly + * clear that ignoring CPUID could have repercussions. See Revision + * Guide for specific AMD processor for more details. + */ + if (!boot_cpu_has(X86_FEATURE_AVIC)) + pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n"); + + return true; +} + /* * Note: * - The module param avic enable both xAPIC and x2APIC mode. * - Hypervisor can support both xAVIC and x2AVIC in the same guest. * - The mode can be switched at run-time. */ -bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +bool __init avic_hardware_setup(void) { - if (!npt_enabled) + avic = avic_want_avic_enabled(); + if (!avic) return false; - if (boot_cpu_has(X86_FEATURE_AVIC)) { - avic_mode = AVIC_MODE_X1; - pr_info("AVIC enabled\n"); - } else if (force_avic) { - /* - * Some older systems does not advertise AVIC support. - * See Revision Guide for specific AMD processor for more detail. - */ - avic_mode = AVIC_MODE_X1; - pr_warn("AVIC is not supported in CPUID but force enabled"); - pr_warn("Your system might crash and burn"); - } + pr_info("AVIC enabled\n"); /* AVIC is a prerequisite for x2AVIC. */ - if (boot_cpu_has(X86_FEATURE_X2AVIC)) { - if (avic_mode == AVIC_MODE_X1) { - avic_mode = AVIC_MODE_X2; - pr_info("x2AVIC enabled\n"); - } else { - pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); - pr_warn(FW_BUG "Try enable AVIC using force_avic option"); - } + x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); + if (x2avic_enabled) { + if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT)) + x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID; + else + x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID; + pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1); + } else { + svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; } - if (avic_mode != AVIC_MODE_NONE) - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + /* + * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) + * due to erratum 1235, which results in missed VM-Exits on the sender + * and thus missed wake events for blocking vCPUs due to the CPU + * failing to see a software update to clear IsRunning. + */ + enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17; + + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - return !!avic_mode; + return true; +} + +void avic_hardware_unsetup(void) +{ + if (avic) + amd_iommu_register_ga_log_notifier(NULL); } |
