diff options
Diffstat (limited to 'arch/x86/kvm/hyperv.c')
| -rw-r--r-- | arch/x86/kvm/hyperv.c | 563 |
1 files changed, 454 insertions, 109 deletions
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0adf4a437e85..de92292eb1f5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -17,28 +17,50 @@ * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "lapic.h" #include "ioapic.h" #include "cpuid.h" #include "hyperv.h" +#include "mmu.h" #include "xen.h" #include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> +#include <linux/spinlock.h> #include <linux/eventfd.h> #include <asm/apicdef.h> +#include <asm/mshyperv.h> #include <trace/events/kvm.h> #include "trace.h" #include "irq.h" #include "fpu.h" -#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) +#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) + +/* + * As per Hyper-V TLFS, extended hypercalls start from 0x8001 + * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value + * where each bit tells which extended hypercall is available besides + * HvExtCallQueryCapabilities. + * + * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit + * assigned. + * + * 0x8002 - Bit 0 + * 0x8003 - Bit 1 + * .. + * 0x8041 - Bit 63 + * + * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64 + */ +#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); @@ -475,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return ret; } -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) +int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vpidx); + if (!level) + return -1; + + synic = synic_get(kvm, e->hv_sint.vcpu); if (!synic) return -EINVAL; - return synic_set_irq(synic, sint); + return synic_set_irq(synic, e->hv_sint.sint); } void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) @@ -705,10 +731,12 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, stimer_cleanup(stimer); stimer->count = count; - if (stimer->count == 0) - stimer->config.enable = 0; - else if (stimer->config.auto_enable) - stimer->config.enable = 1; + if (!host) { + if (stimer->count == 0) + stimer->config.enable = 0; + else if (stimer->config.auto_enable) + stimer->config.enable = 1; + } if (stimer->config.enable) stimer_mark_pending(stimer, false); @@ -895,17 +923,19 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } -EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_assist_page_enabled); -bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, - struct hv_vp_assist_page *assist_page) +int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { - if (!kvm_hv_assist_page_enabled(vcpu)) - return false; - return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, - assist_page, sizeof(*assist_page)); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu)) + return -EFAULT; + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, + &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } -EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_get_assist_page); static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { @@ -926,8 +956,7 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; - hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - stimer->timer.function = stimer_timer_callback; + hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } @@ -954,6 +983,11 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) hv_vcpu->vp_index = vcpu->vcpu_idx; + for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) { + INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries); + spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock); + } + return 0; } @@ -989,6 +1023,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; @@ -1133,15 +1168,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); - mutex_lock(&hv->hv_lock); + guard(mutex)(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_SET || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) - goto out_unlock; + return; if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - goto out_unlock; + return; gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* @@ -1157,7 +1192,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; - goto out_unlock; + return; } /* @@ -1193,12 +1228,10 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; - goto out_unlock; + return; out_err: hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; -out_unlock: - mutex_unlock(&hv->hv_lock); } void kvm_hv_request_tsc_page_update(struct kvm *kvm) @@ -1263,7 +1296,6 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_VP_ASSIST_PAGE: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_APIC_ACCESS_AVAILABLE; - break; case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: return hv_vcpu->cpuid_cache.features_eax & @@ -1273,6 +1305,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_TSC_EMULATION_STATUS: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_REENLIGHTENMENT; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_TSC_INVARIANT; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: return hv_vcpu->cpuid_cache.features_edx & @@ -1288,6 +1323,56 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) return false; } +#define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839 +#define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */ + +/* + * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in !XSAVES && XSAVEC + * configuration. + * Such configuration can result from, for example, AMD Erratum 1386 workaround. + * + * Print a notice so users aren't left wondering what's suddenly gone wrong. + */ +static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = to_kvm_hv(kvm); + + /* Check again under the hv_lock. */ + if (hv->xsaves_xsavec_checked) + return; + + if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) != + KVM_HV_WIN2016_GUEST_ID) + return; + + hv->xsaves_xsavec_checked = true; + + /* UP configurations aren't affected */ + if (atomic_read(&kvm->online_vcpus) < 2) + return; + + if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) || + !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC)) + return; + + pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. " + "If it fails to boot try disabling XSAVEC in the VM config.\n"); +} + +void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) +{ + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (!vcpu->arch.hyperv_enabled || + hv->xsaves_xsavec_checked) + return; + + mutex_lock(&hv->hv_lock); + __kvm_hv_xsaves_xsavec_maybe_warn(vcpu); + mutex_unlock(&hv->hv_lock); +} + static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { @@ -1333,7 +1418,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, } /* vmcall/vmmcall */ - static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); + kvm_x86_call(patch_hypercall)(vcpu, instructions + i); i += 3; /* ret */ @@ -1400,12 +1485,22 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + /* Only bit 0 is supported */ + if (data & ~HV_EXPOSE_INVARIANT_TSC) + return 1; + + /* The feature can't be disabled from the guest */ + if (!host && hv->hv_invtsc_control && !data) + return 1; + + hv->hv_invtsc_control = data; + break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; @@ -1473,7 +1568,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. */ - if (__put_user(0, (u32 __user *)addr)) + if (put_user(0, (u32 __user *)addr)) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); @@ -1526,8 +1621,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } @@ -1575,11 +1669,14 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + data = hv->hv_invtsc_control; + break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } @@ -1641,10 +1738,11 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, data = (u64)vcpu->arch.virtual_tsc_khz * 1000; break; case HV_X64_MSR_APIC_FREQUENCY: - data = APIC_BUS_FREQUENCY; + data = div64_u64(1000000000ULL, + vcpu->kvm->arch.apic_bus_cycle_ns); break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; @@ -1736,7 +1834,30 @@ static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, } } +static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[]) +{ + int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK; + unsigned long sbank; + + if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask)) + return false; + + /* + * The index into the sparse bank is the number of preceding bits in + * the valid mask. Optimize for VMs with <64 vCPUs by skipping the + * fancy math if there can't possibly be preceding bits. + */ + if (valid_bit_nr) + sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0)); + else + sbank = 0; + + return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK, + (unsigned long *)&sparse_banks[sbank]); +} + struct kvm_hv_hcall { + /* Hypercall input data */ u64 param; u64 ingpa; u64 outgpa; @@ -1747,59 +1868,182 @@ struct kvm_hv_hcall { bool fast; bool rep; sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; -}; -static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, - int consumed_xmm_halves, - u64 *sparse_banks, gpa_t offset) -{ - u16 var_cnt; - int i; + /* + * Current read offset when KVM reads hypercall input data gradually, + * either offset in bytes from 'ingpa' for regular hypercalls or the + * number of already consumed 'XMM halves' for 'fast' hypercalls. + */ + union { + gpa_t data_offset; + int consumed_xmm_halves; + }; +}; - if (hc->var_cnt > 64) - return -EINVAL; - /* Ignore banks that cannot possibly contain a legal VP index. */ - var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS); +static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, + u16 orig_cnt, u16 cnt_cap, u64 *data) +{ + /* + * Preserve the original count when ignoring entries via a "cap", KVM + * still needs to validate the guest input (though the non-XMM path + * punts on the checks). + */ + u16 cnt = min(orig_cnt, cnt_cap); + int i, j; if (hc->fast) { /* * Each XMM holds two sparse banks, but do not count halves that * have already been consumed for hypercall parameters. */ - if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves) + if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves) return HV_STATUS_INVALID_HYPERCALL_INPUT; - for (i = 0; i < var_cnt; i++) { - int j = i + consumed_xmm_halves; + + for (i = 0; i < cnt; i++) { + j = i + hc->consumed_xmm_halves; if (j % 2) - sparse_banks[i] = sse128_hi(hc->xmm[j / 2]); + data[i] = sse128_hi(hc->xmm[j / 2]); else - sparse_banks[i] = sse128_lo(hc->xmm[j / 2]); + data[i] = sse128_lo(hc->xmm[j / 2]); } return 0; } - return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks, - var_cnt * sizeof(*sparse_banks)); + return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data, + cnt * sizeof(*data)); +} + +static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, + u64 *sparse_banks) +{ + if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS) + return -EINVAL; + + /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */ + return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS, + sparse_banks); +} + +static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[]) +{ + return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries); +} + +static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo, + u64 *entries, int count) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY; + + if (!hv_vcpu) + return; + + spin_lock(&tlb_flush_fifo->write_lock); + + /* + * All entries should fit on the fifo leaving one free for 'flush all' + * entry in case another request comes in. In case there's not enough + * space, just put 'flush all' entry there. + */ + if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) { + WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count); + goto out_unlock; + } + + /* + * Note: full fifo always contains 'flush all' entry, no need to check the + * return value. + */ + kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1); + +out_unlock: + spin_unlock(&tlb_flush_fifo->write_lock); +} + +int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE]; + int i, j, count; + gva_t gva; + + if (!tdp_enabled || !hv_vcpu) + return -EINVAL; + + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu)); + + count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE); + + for (i = 0; i < count; i++) { + if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY) + goto out_flush_all; + + if (is_noncanonical_invlpg_address(entries[i], vcpu)) + continue; + + /* + * Lower 12 bits of 'address' encode the number of additional + * pages to flush. + */ + gva = entries[i] & PAGE_MASK; + for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++) + kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); + + ++vcpu->stat.tlb_flush; + } + return 0; + +out_flush_all: + kfifo_reset_out(&tlb_flush_fifo->entries); + + /* Fall back to full flush. */ + return -ENOSPC; } static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + unsigned long *vcpu_mask = hv_vcpu->vcpu_mask; + u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; - DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + /* + * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE' + * entries on the TLB flush fifo. The last entry, however, needs to be + * always left free for 'flush all' entry which gets placed when + * there is not enough space to put all the requested entries. + */ + u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1]; + u64 *tlb_flush_entries; u64 valid_bank_mask; - u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + struct kvm_vcpu *v; + unsigned long i; bool all_cpus; /* - * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the - * valid mask is a u64. Fail the build if KVM's max allowed number of - * vCPUs (>4096) would exceed this limit, KVM will additional changes - * for Hyper-V support to avoid setting the guest up to fail. + * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS + * sparse banks. Fail the build if KVM's max allowed number of + * vCPUs (>4096) exceeds this limit. + */ + BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS); + + /* + * 'Slow' hypercall's first parameter is the address in guest's memory + * where hypercall parameters are placed. This is either a GPA or a + * nested GPA when KVM is handling the call from L2 ('direct' TLB + * flush). Translate the address here so the memory can be uniformly + * read with kvm_read_guest(). */ - BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64); + if (!hc->fast && is_guest_mode(vcpu)) { + hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); + if (unlikely(hc->ingpa == INVALID_GPA)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) { @@ -1807,14 +2051,17 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) flush.address_space = hc->ingpa; flush.flags = hc->outgpa; flush.processor_mask = sse128_lo(hc->xmm[0]); + hc->consumed_xmm_halves = 1; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush, sizeof(flush)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; + hc->data_offset = sizeof(flush); } trace_kvm_hv_flush_tlb(flush.processor_mask, - flush.address_space, flush.flags); + flush.address_space, flush.flags, + is_guest_mode(vcpu)); valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; @@ -1834,16 +2081,18 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) flush_ex.flags = hc->outgpa; memcpy(&flush_ex.hv_vp_set, &hc->xmm[0], sizeof(hc->xmm[0])); + hc->consumed_xmm_halves = 2; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, sizeof(flush_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; + hc->data_offset = sizeof(flush_ex); } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, flush_ex.hv_vp_set.format, flush_ex.address_space, - flush_ex.flags); + flush_ex.flags, is_guest_mode(vcpu)); valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask; all_cpus = flush_ex.hv_vp_set.format != @@ -1852,29 +2101,95 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - if (all_cpus) - goto do_flush; + if (!all_cpus) { + if (!hc->var_cnt) + goto ret_success; - if (!hc->var_cnt) - goto ret_success; + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + /* + * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU + * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs' + * case (HV_GENERIC_SET_ALL). Always adjust data_offset and + * consumed_xmm_halves to make sure TLB flush entries are read + * from the correct offset. + */ + if (hc->fast) + hc->consumed_xmm_halves += hc->var_cnt; + else + hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]); + } - if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks, - offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents))) + if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || + hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX || + hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) { + tlb_flush_entries = NULL; + } else { + if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries)) return HV_STATUS_INVALID_HYPERCALL_INPUT; + tlb_flush_entries = __tlb_flush_entries; } -do_flush: /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - if (all_cpus) { - kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); - } else { + if (all_cpus && !is_guest_mode(vcpu)) { + kvm_for_each_vcpu(i, v, kvm) { + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); + } else if (!is_guest_mode(vcpu)) { sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask); + for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { + v = kvm_get_vcpu(kvm, i); + if (!v) + continue; + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); + } else { + struct kvm_vcpu_hv *hv_v; + + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); + + kvm_for_each_vcpu(i, v, kvm) { + hv_v = to_hv_vcpu(v); + + /* + * The following check races with nested vCPUs entering/exiting + * and/or migrating between L1's vCPUs, however the only case when + * KVM *must* flush the TLB is when the target L2 vCPU keeps + * running on the same L1 vCPU from the moment of the request until + * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other + * cases, e.g. when the target L2 vCPU migrates to a different L1 + * vCPU or when the corresponding L1 vCPU temporary switches to a + * different L2 vCPU while the request is being processed. + */ + if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id) + continue; + + if (!all_cpus && + !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask, + sparse_banks)) + continue; + + __set_bit(i, vcpu_mask); + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } ret_success: @@ -1883,8 +2198,8 @@ ret_success: ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } -static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, - unsigned long *vcpu_bitmap) +static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, + u64 *sparse_banks, u64 valid_bank_mask) { struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, @@ -1894,7 +2209,9 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) + if (sparse_banks && + !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu), + valid_bank_mask, sparse_banks)) continue; /* We fail only when APIC is disabled */ @@ -1904,15 +2221,18 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; - DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); u64 valid_bank_mask; - u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, @@ -1959,9 +2279,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) if (!hc->var_cnt) goto ret_success; - if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks, - offsetof(struct hv_send_ipi_ex, - vp_set.bank_contents))) + if (!hc->fast) + hc->data_offset = offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents); + else + hc->consumed_xmm_halves = 1; + + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } @@ -1969,13 +2293,10 @@ check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - if (all_cpus) { - kvm_send_ipi_to_many(kvm, vector, NULL); - } else { - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - - kvm_send_ipi_to_many(kvm, vector, vcpu_mask); - } + if (all_cpus) + kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0); + else + kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask); ret_success: return HV_STATUS_SUCCESS; @@ -2062,10 +2383,25 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { + u32 tlb_lock_count = 0; + int ret; + + if (hv_result_success(result) && is_guest_mode(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu) && + kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa, + &tlb_lock_count, sizeof(tlb_lock_count))) + result = HV_STATUS_INVALID_HYPERCALL_INPUT; + trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; - return kvm_skip_emulated_instruction(vcpu); + + ret = kvm_skip_emulated_instruction(vcpu); + + if (tlb_lock_count) + kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu); + + return ret; } static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) @@ -2110,7 +2446,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *h if (!eventfd) return HV_STATUS_INVALID_PORT_ID; - eventfd_signal(eventfd, 1); + eventfd_signal(eventfd); return HV_STATUS_SUCCESS; } @@ -2178,6 +2514,9 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) case HVCALL_SEND_IPI: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_CLUSTER_IPI_RECOMMENDED; + case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + return hv_vcpu->cpuid_cache.features_ebx & + HV_ENABLE_EXTENDED_HYPERCALLS; default: break; } @@ -2195,7 +2534,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ - if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { + if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -2270,14 +2609,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = hc.param; - vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; - vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; - vcpu->arch.complete_userspace_io = - kvm_hv_hypercall_complete_userspace; - return 0; + goto hypercall_userspace_exit; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -2336,15 +2668,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = HV_STATUS_OPERATION_DENIED; break; } - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = hc.param; - vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; - vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; - vcpu->arch.complete_userspace_io = - kvm_hv_hypercall_complete_userspace; - return 0; + goto hypercall_userspace_exit; } + case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + if (unlikely(hc.fast)) { + ret = HV_STATUS_INVALID_PARAMETER; + break; + } + goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; @@ -2352,6 +2683,15 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); + +hypercall_userspace_exit: + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; + vcpu->run->hyperv.u.hcall.input = hc.param; + vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; + vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; + vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; + return 0; } void kvm_hv_init_vm(struct kvm *kvm) @@ -2491,9 +2831,11 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; ent->eax |= HV_ACCESS_FREQUENCY_MSRS; ent->eax |= HV_ACCESS_REENLIGHTENMENT; + ent->eax |= HV_ACCESS_TSC_INVARIANT; ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; + ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; @@ -2502,6 +2844,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_DEBUGGING; ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH; /* * Direct Synthetic timers only make sense with in-kernel @@ -2516,7 +2859,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; @@ -2545,6 +2889,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; + ent->eax |= HV_X64_NESTED_DIRECT_FLUSH; ent->eax |= HV_X64_NESTED_MSR_BITMAP; ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; break; |
