diff options
Diffstat (limited to 'arch/x86/kvm/hyperv.c')
| -rw-r--r-- | arch/x86/kvm/hyperv.c | 1756 |
1 files changed, 1395 insertions, 361 deletions
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c90a5352d158..de92292eb1f5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * KVM Microsoft Hyper-V emulation * @@ -15,28 +16,51 @@ * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "lapic.h" #include "ioapic.h" +#include "cpuid.h" #include "hyperv.h" +#include "mmu.h" +#include "xen.h" +#include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> +#include <linux/spinlock.h> #include <linux/eventfd.h> #include <asm/apicdef.h> +#include <asm/mshyperv.h> #include <trace/events/kvm.h> #include "trace.h" +#include "irq.h" +#include "fpu.h" -#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) +#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) + +/* + * As per Hyper-V TLFS, extended hypercalls start from 0x8001 + * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value + * where each bit tells which extended hypercall is available besides + * HvExtCallQueryCapabilities. + * + * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit + * assigned. + * + * 0x8002 - Bit 0 + * 0x8003 - Bit 1 + * .. + * 0x8041 - Bit 63 + * + * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64 + */ +#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); @@ -83,6 +107,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + bool auto_eoi_old, auto_eoi_new; + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; @@ -91,10 +119,37 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, else __clear_bit(vector, synic->vec_bitmap); + auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256); + if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); + + auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256); + + if (auto_eoi_old == auto_eoi_new) + return; + + if (!enable_apicv) + return; + + down_write(&vcpu->kvm->arch.apicv_update_lock); + + if (auto_eoi_new) + hv->synic_auto_eoi_used++; + else + hv->synic_auto_eoi_used--; + + /* + * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on + * the hypervisor to manually inject IRQs. + */ + __kvm_set_or_clear_apicv_inhibit(vcpu->kvm, + APICV_INHIBIT_REASON_HYPERV, + !!hv->synic_auto_eoi_used); + + up_write(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@ -128,23 +183,23 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, synic_update_vector(synic, vector); /* Load SynIC vectors into EOI exit bitmap */ - kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic)); + kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic)); return 0; } static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; if (vpidx >= KVM_MAX_VCPUS) return NULL; vcpu = kvm_get_vcpu(kvm, vpidx); - if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + if (kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; return NULL; } @@ -155,17 +210,17 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) struct kvm_vcpu_hv_synic *synic; vcpu = get_vcpu_by_vpidx(kvm, vpidx); - if (!vcpu) + if (!vcpu || !to_hv_vcpu(vcpu)) return NULL; - synic = vcpu_to_synic(vcpu); + synic = to_hv_synic(vcpu); return (synic->active) ? synic : NULL; } static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) { struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; int gsi, idx; @@ -189,8 +244,8 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; hv_vcpu->exit.u.synic.msr = msr; @@ -204,10 +259,10 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 data, bool host) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int ret; - if (!synic->active && !host) + if (!synic->active && (!host || data)) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); @@ -253,6 +308,9 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, case HV_X64_MSR_EOM: { int i; + if (!synic->active) + break; + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) kvm_hv_notify_acked_sint(vcpu, i); break; @@ -267,6 +325,115 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } +static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + return hv_vcpu->cpuid_cache.syndbg_cap_eax & + HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; +} + +static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) + hv->hv_syndbg.control.status = + vcpu->run->hyperv.u.syndbg.status; + return 1; +} + +static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; + hv_vcpu->exit.u.syndbg.msr = msr; + hv_vcpu->exit.u.syndbg.control = syndbg->control.control; + hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; + hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; + hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; + vcpu->arch.complete_userspace_io = + kvm_hv_syndbg_complete_userspace; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); +} + +static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, + to_hv_vcpu(vcpu)->vp_index, msr, data); + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + syndbg->control.control = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_STATUS: + syndbg->control.status = data; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + syndbg->control.send_page = data; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + syndbg->control.recv_page = data; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + syndbg->control.pending_page = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + syndbg->options = data; + break; + default: + break; + } + + return 0; +} + +static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) +{ + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + *pdata = syndbg->control.control; + break; + case HV_X64_MSR_SYNDBG_STATUS: + *pdata = syndbg->control.status; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + *pdata = syndbg->control.send_page; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + *pdata = syndbg->control.recv_page; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + *pdata = syndbg->control.pending_page; + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + *pdata = syndbg->options; + break; + default: + break; + } + + trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata); + + return 0; +} + static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { @@ -304,10 +471,13 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_lapic_irq irq; int ret, vector; + if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return -EINVAL; + if (sint >= ARRAY_SIZE(synic->sint)) return -EINVAL; @@ -327,20 +497,24 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return ret; } -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) +int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vpidx); + if (!level) + return -1; + + synic = synic_get(kvm, e->hv_sint.vcpu); if (!synic) return -EINVAL; - return synic_set_irq(synic, sint); + return synic_set_irq(synic, e->hv_sint.sint); } void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) { - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); int i; trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); @@ -397,15 +571,15 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) static u64 get_time_ref_counter(struct kvm *kvm) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; u64 tsc; /* - * The guest has not set up the TSC page or the clock isn't - * stable, fall back to get_kvmclock_ns. + * Fall back to get_kvmclock_ns() when TSC page hasn't been set up, + * is broken, disabled or being updated. */ - if (!hv->tsc_ref.tsc_sequence) + if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET) return div_u64(get_kvmclock_ns(kvm), 100); vcpu = kvm_get_vcpu(kvm, 0); @@ -417,10 +591,10 @@ static u64 get_time_ref_counter(struct kvm *kvm) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); set_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + to_hv_vcpu(vcpu)->stimer_pending_bitmap); kvm_make_request(KVM_REQ_HV_STIMER, vcpu); if (vcpu_kick) kvm_vcpu_kick(vcpu); @@ -428,14 +602,14 @@ static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); - trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); hrtimer_cancel(&stimer->timer); clear_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + to_hv_vcpu(vcpu)->stimer_pending_bitmap); stimer->msg_pending = false; stimer->exp_time = 0; } @@ -445,7 +619,7 @@ static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) struct kvm_vcpu_hv_stimer *stimer; stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); - trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); stimer_mark_pending(stimer, true); @@ -462,7 +636,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) u64 time_now; ktime_t ktime_now; - time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); + time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm); ktime_now = ktime_get(); if (stimer->config.periodic) { @@ -479,7 +653,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) stimer->exp_time = time_now + stimer->count; trace_kvm_hv_stimer_start_periodic( - stimer_to_vcpu(stimer)->vcpu_id, + hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->exp_time); @@ -501,7 +675,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) return 0; } - trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->count); @@ -516,8 +690,19 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); - trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, + if (!synic->active && (!host || config)) + return 1; + + if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode && + !(hv_vcpu->cpuid_cache.features_edx & + HV_STIMER_DIRECT_MODE_AVAILABLE))) + return 1; + + trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); @@ -526,23 +711,36 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, new_config.enable = 0; stimer->config.as_uint64 = new_config.as_uint64; - stimer_mark_pending(stimer, false); + if (stimer->config.enable) + stimer_mark_pending(stimer, false); + return 0; } static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { - trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); + + if (!synic->active && (!host || count)) + return 1; + + trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); stimer_cleanup(stimer); stimer->count = count; - if (stimer->count == 0) - stimer->config.enable = 0; - else if (stimer->config.auto_enable) - stimer->config.enable = 1; - stimer_mark_pending(stimer, false); + if (!host) { + if (stimer->count == 0) + stimer->config.enable = 0; + else if (stimer->config.auto_enable) + stimer->config.enable = 1; + } + + if (stimer->config.enable) + stimer_mark_pending(stimer, false); + return 0; } @@ -561,7 +759,7 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, struct hv_message *src_msg, bool no_retry) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int msg_off = offsetof(struct hv_message_page, sint_message[sint]); gfn_t msg_page_gfn; struct hv_message_header hv_hdr; @@ -617,7 +815,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; @@ -630,20 +828,22 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); - return synic_deliver_msg(vcpu_to_synic(vcpu), + return synic_deliver_msg(to_hv_synic(vcpu), stimer->config.sintx, msg, no_retry); } static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = stimer->config.apic_vector }; - return !kvm_apic_set_irq(vcpu, &irq, NULL); + if (lapic_in_kernel(vcpu)) + return !kvm_apic_set_irq(vcpu, &irq, NULL); + return 0; } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) @@ -655,7 +855,7 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) r = stimer_send_msg(stimer); else r = stimer_notify_direct(stimer); - trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, direct, r); if (!r) { stimer->msg_pending = false; @@ -666,11 +866,14 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; u64 time_now, exp_time; int i; + if (!hv_vcpu) + return; + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { stimer = &hv_vcpu->stimer[i]; @@ -696,30 +899,43 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; + if (!hv_vcpu) + return; + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_cleanup(&hv_vcpu->stimer[i]); + + kfree(hv_vcpu); + vcpu->arch.hyperv = NULL; } bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return false; + + if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } -EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_assist_page_enabled); -bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, - struct hv_vp_assist_page *assist_page) +int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { - if (!kvm_hv_assist_page_enabled(vcpu)) - return false; - return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, - assist_page, sizeof(*assist_page)); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu)) + return -EFAULT; + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, + &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } -EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_get_assist_page); static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { @@ -740,41 +956,55 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; - hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - stimer->timer.function = stimer_timer_callback; + hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } -void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) +int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; + if (hv_vcpu) + return 0; + + hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT); + if (!hv_vcpu) + return -ENOMEM; + + vcpu->arch.hyperv = hv_vcpu; + hv_vcpu->vcpu = vcpu; + synic_init(&hv_vcpu->synic); bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); -} -void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + hv_vcpu->vp_index = vcpu->vcpu_idx; - hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); + for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) { + INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries); + spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock); + } + + return 0; } int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu_hv_synic *synic; + int r; + + r = kvm_hv_vcpu_init(vcpu); + if (r) + return r; + + synic = to_hv_synic(vcpu); - /* - * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so deactivate APICV - */ - kvm_vcpu_deactivate_apicv(vcpu); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; + synic->control = HV_SYNIC_CONTROL_ENABLE; return 0; } @@ -793,6 +1023,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } @@ -800,58 +1033,44 @@ static bool kvm_hv_msr_partition_wide(u32 msr) return r; } -static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 *pdata) +static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - *pdata = hv->hv_crash_param[index]; + *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; return 0; } -static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata) +static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); *pdata = hv->hv_crash_ctl; return 0; } -static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) +static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - if (host) - hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; - - if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) { - - vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", - hv->hv_crash_param[0], - hv->hv_crash_param[1], - hv->hv_crash_param[2], - hv->hv_crash_param[3], - hv->hv_crash_param[4]); + struct kvm_hv *hv = to_kvm_hv(kvm); - /* Send notification about crash to user space */ - kvm_make_request(KVM_REQ_HV_CRASH, vcpu); - } + hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; return 0; } -static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 data) +static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - hv->hv_crash_param[index] = data; + hv->hv_crash_param[array_index_nospec(index, size)] = data; return 0; } @@ -891,7 +1110,7 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, * These two equivalencies are implemented in this function. */ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, - HV_REFERENCE_TSC_PAGE *tsc_ref) + struct ms_hyperv_tsc_page *tsc_ref) { u64 max_mul; @@ -924,22 +1143,40 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, return true; } +/* + * Don't touch TSC page values if the guest has opted for TSC emulation after + * migration. KVM doesn't fully support reenlightenment notifications and TSC + * access emulation and Hyper-V is known to expect the values in TSC page to + * stay constant before TSC access emulation is disabled from guest side + * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC + * frequency and guest visible TSC value across migration (and prevent it when + * TSC scaling is unsupported). + */ +static inline bool tsc_page_update_unsafe(struct kvm_hv *hv) +{ + return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) && + hv->hv_tsc_emulation_control; +} + void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); u32 tsc_seq; u64 gfn; BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); - BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); + BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); - if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + guard(mutex)(&hv->hv_lock); + + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_SET || + hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) return; - mutex_lock(&kvm->arch.hyperv.hv_lock); if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - goto out_unlock; + return; gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* @@ -948,7 +1185,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) - goto out_unlock; + goto out_err; + + if (tsc_seq && tsc_page_update_unsafe(hv)) { + if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; + return; + } /* * While we're computing and writing the parameters, force the @@ -957,15 +1202,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - goto out_unlock; + goto out_err; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - goto out_unlock; + goto out_err; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - goto out_unlock; + goto out_err; /* * Now switch to the TSC page mechanism by writing the sequence. @@ -978,17 +1223,164 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, smp_wmb(); hv->tsc_ref.tsc_sequence = tsc_seq; - kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); -out_unlock: - mutex_unlock(&kvm->arch.hyperv.hv_lock); + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; + return; + +out_err: + hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; +} + +void kvm_hv_request_tsc_page_update(struct kvm *kvm) +{ + struct kvm_hv *hv = to_kvm_hv(kvm); + + mutex_lock(&hv->hv_lock); + + if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET && + !tsc_page_update_unsafe(hv)) + hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; + + mutex_unlock(&hv->hv_lock); +} + +static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) +{ + if (!hv_vcpu->enforce_cpuid) + return true; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_HYPERCALL_AVAILABLE; + case HV_X64_MSR_VP_RUNTIME: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_VP_RUNTIME_AVAILABLE; + case HV_X64_MSR_TIME_REF_COUNT: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_TIME_REF_COUNT_AVAILABLE; + case HV_X64_MSR_VP_INDEX: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_VP_INDEX_AVAILABLE; + case HV_X64_MSR_RESET: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_RESET_AVAILABLE; + case HV_X64_MSR_REFERENCE_TSC: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_REFERENCE_TSC_AVAILABLE; + case HV_X64_MSR_SCONTROL: + case HV_X64_MSR_SVERSION: + case HV_X64_MSR_SIEFP: + case HV_X64_MSR_SIMP: + case HV_X64_MSR_EOM: + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_SYNIC_AVAILABLE; + case HV_X64_MSR_STIMER0_CONFIG: + case HV_X64_MSR_STIMER1_CONFIG: + case HV_X64_MSR_STIMER2_CONFIG: + case HV_X64_MSR_STIMER3_CONFIG: + case HV_X64_MSR_STIMER0_COUNT: + case HV_X64_MSR_STIMER1_COUNT: + case HV_X64_MSR_STIMER2_COUNT: + case HV_X64_MSR_STIMER3_COUNT: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_SYNTIMER_AVAILABLE; + case HV_X64_MSR_EOI: + case HV_X64_MSR_ICR: + case HV_X64_MSR_TPR: + case HV_X64_MSR_VP_ASSIST_PAGE: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_APIC_ACCESS_AVAILABLE; + case HV_X64_MSR_TSC_FREQUENCY: + case HV_X64_MSR_APIC_FREQUENCY: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_FREQUENCY_MSRS; + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_REENLIGHTENMENT; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_TSC_INVARIANT; + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return hv_vcpu->cpuid_cache.features_edx & + HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return hv_vcpu->cpuid_cache.features_edx & + HV_FEATURE_DEBUG_MSRS_AVAILABLE; + default: + break; + } + + return false; +} + +#define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839 +#define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */ + +/* + * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in !XSAVES && XSAVEC + * configuration. + * Such configuration can result from, for example, AMD Erratum 1386 workaround. + * + * Print a notice so users aren't left wondering what's suddenly gone wrong. + */ +static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = to_kvm_hv(kvm); + + /* Check again under the hv_lock. */ + if (hv->xsaves_xsavec_checked) + return; + + if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) != + KVM_HV_WIN2016_GUEST_ID) + return; + + hv->xsaves_xsavec_checked = true; + + /* UP configurations aren't affected */ + if (atomic_read(&kvm->online_vcpus) < 2) + return; + + if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) || + !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC)) + return; + + pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. " + "If it fails to boot try disabling XSAVEC in the VM config.\n"); +} + +void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) +{ + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (!vcpu->arch.hyperv_enabled || + hv->xsaves_xsavec_checked) + return; + + mutex_lock(&hv->hv_lock); + __kvm_hv_xsaves_xsavec_maybe_warn(vcpu); + mutex_unlock(&hv->hv_lock); } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); + + if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) + return 1; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: @@ -998,9 +1390,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; break; case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; + u8 instructions[9]; + int i = 0; + u64 addr; /* if guest os id is not set hypercall should remain disabled */ if (!hv->hv_guest_os_id) @@ -1009,29 +1401,67 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_hypercall = data; break; } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops->patch_hypercall(vcpu, instructions); - ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) + + /* + * If Xen and Hyper-V hypercalls are both enabled, disambiguate + * the same way Xen itself does, by setting the bit 31 of EAX + * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just + * going to be clobbered on 64-bit. + */ + if (kvm_xen_hypercall_enabled(kvm)) { + /* orl $0x80000000, %eax */ + instructions[i++] = 0x0d; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x80; + } + + /* vmcall/vmmcall */ + kvm_x86_call(patch_hypercall)(vcpu, instructions + i); + i += 3; + + /* ret */ + ((unsigned char *)instructions)[i++] = 0xc3; + + addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; + if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) return 1; hv->hv_hypercall = data; - mark_page_dirty(kvm, gfn); break; } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; - if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) + if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { + if (!host) + hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED; + else + hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + } else { + hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET; + } break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_set_crash_data(vcpu, + return kvm_hv_msr_set_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, data); case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + if (host) + return kvm_hv_msr_set_crash_ctl(kvm, data); + + if (data & HV_CRASH_CTL_CRASH_NOTIFY) { + vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", + hv->hv_crash_param[0], + hv->hv_crash_param[1], + hv->hv_crash_param[2], + hv->hv_crash_param[3], + hv->hv_crash_param[4]); + + /* Send notification about crash to user space */ + kvm_make_request(KVM_REQ_HV_CRASH, vcpu); + } + break; case HV_X64_MSR_RESET: if (data == 1) { vcpu_debug(vcpu, "hyper-v reset requested\n"); @@ -1045,6 +1475,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_tsc_emulation_control = data; break; case HV_X64_MSR_TSC_EMULATION_STATUS: + if (data && !host) + return 1; + hv->hv_tsc_emulation_status = data; break; case HV_X64_MSR_TIME_REF_COUNT: @@ -1052,9 +1485,22 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + /* Only bit 0 is supported */ + if (data & ~HV_EXPOSE_INVARIANT_TSC) + return 1; + + /* The feature can't be disabled from the guest */ + if (!host && hv->hv_invtsc_control && !data) + return 1; + + hv->hv_invtsc_control = data; + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_set_msr(vcpu, msr, data, host); default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; @@ -1072,12 +1518,14 @@ static u64 current_task_runtime_100ns(void) static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) + return 1; switch (msr) { case HV_X64_MSR_VP_INDEX: { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - int vcpu_idx = kvm_vcpu_get_idx(vcpu); + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); u32 new_vp_index = (u32)data; if (!host || new_vp_index >= KVM_MAX_VCPUS) @@ -1092,9 +1540,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) * VP index is changing, adjust num_mismatched_vp_indexes if * it now matches or no longer matches vcpu_idx. */ - if (hv_vcpu->vp_index == vcpu_idx) + if (hv_vcpu->vp_index == vcpu->vcpu_idx) atomic_inc(&hv->num_mismatched_vp_indexes); - else if (new_vp_index == vcpu_idx) + else if (new_vp_index == vcpu->vcpu_idx) atomic_dec(&hv->num_mismatched_vp_indexes); hv_vcpu->vp_index = new_vp_index; @@ -1106,7 +1554,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) + if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1116,15 +1564,15 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; /* - * Clear apic_assist portion of f(struct hv_vp_assist_page + * Clear apic_assist portion of struct hv_vp_assist_page * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. */ - if (__clear_user((void __user *)addr, sizeof(u32))) + if (put_user(0, (u32 __user *)addr)) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, + if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; @@ -1147,14 +1595,14 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host); + return synic_set_msr(to_hv_synic(vcpu), msr, data, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - return stimer_set_config(vcpu_to_stimer(vcpu, timer_index), + return stimer_set_config(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_STIMER0_COUNT: @@ -1163,7 +1611,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), + return stimer_set_count(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_TSC_FREQUENCY: @@ -1173,19 +1621,22 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; } -static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); + + if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) + return 1; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: @@ -1201,11 +1652,11 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = hv->hv_tsc_page; break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_get_crash_data(vcpu, + return kvm_hv_msr_get_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, pdata); case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + return kvm_hv_msr_get_crash_ctl(kvm, pdata); case HV_X64_MSR_RESET: data = 0; break; @@ -1218,8 +1669,14 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + data = hv->hv_invtsc_control; + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_get_msr(vcpu, msr, pdata, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } @@ -1231,7 +1688,10 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) + return 1; switch (msr) { case HV_X64_MSR_VP_INDEX: @@ -1255,14 +1715,14 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host); + return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - return stimer_get_config(vcpu_to_stimer(vcpu, timer_index), + return stimer_get_config(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_STIMER0_COUNT: @@ -1271,17 +1731,18 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), + return stimer_get_count(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_TSC_FREQUENCY: data = (u64)vcpu->arch.virtual_tsc_khz * 1000; break; case HV_X64_MSR_APIC_FREQUENCY: - data = APIC_BUS_FREQUENCY; + data = div64_u64(1000000000ULL, + vcpu->kvm->arch.apic_bus_cycle_ns); break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; @@ -1290,12 +1751,20 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (!host && !vcpu->arch.hyperv_enabled) + return 1; + + if (kvm_hv_vcpu_init(vcpu)) + return 1; + if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_lock(&hv->hv_lock); r = kvm_hv_set_msr_pw(vcpu, msr, data, host); - mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_set_msr(vcpu, msr, data, host); @@ -1303,132 +1772,446 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (!host && !vcpu->arch.hyperv_enabled) + return 1; + + if (kvm_hv_vcpu_init(vcpu)) + return 1; + if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); - r = kvm_hv_get_msr_pw(vcpu, msr, pdata); - mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_lock(&hv->hv_lock); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); + mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_get_msr(vcpu, msr, pdata, host); } -static __always_inline unsigned long *sparse_set_to_vcpu_mask( - struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, - u64 *vp_bitmap, unsigned long *vcpu_bitmap) +static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, + u64 valid_bank_mask, unsigned long *vcpu_mask) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); + bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes); + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; struct kvm_vcpu *vcpu; - int i, bank, sbank = 0; + int bank, sbank = 0; + unsigned long i; + u64 *bitmap; + + BUILD_BUG_ON(sizeof(vp_bitmap) > + sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS)); - memset(vp_bitmap, 0, - KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); + /* + * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else + * fill a temporary buffer and manually test each vCPU's VP index. + */ + if (likely(!has_mismatch)) + bitmap = (u64 *)vcpu_mask; + else + bitmap = vp_bitmap; + + /* + * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask + * having a '1' for each bank that exists in sparse_banks. Sets must + * be in ascending order, i.e. bank0..bankN. + */ + memset(bitmap, 0, sizeof(vp_bitmap)); for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, KVM_HV_MAX_SPARSE_VCPU_SET_BITS) - vp_bitmap[bank] = sparse_banks[sbank++]; + bitmap[bank] = sparse_banks[sbank++]; - if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) { - /* for all vcpus vp_index == vcpu_idx */ - return (unsigned long *)vp_bitmap; - } + if (likely(!has_mismatch)) + return; - bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { - if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index, - (unsigned long *)vp_bitmap)) - __set_bit(i, vcpu_bitmap); + if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap)) + __set_bit(i, vcpu_mask); + } +} + +static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[]) +{ + int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK; + unsigned long sbank; + + if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask)) + return false; + + /* + * The index into the sparse bank is the number of preceding bits in + * the valid mask. Optimize for VMs with <64 vCPUs by skipping the + * fancy math if there can't possibly be preceding bits. + */ + if (valid_bit_nr) + sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0)); + else + sbank = 0; + + return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK, + (unsigned long *)&sparse_banks[sbank]); +} + +struct kvm_hv_hcall { + /* Hypercall input data */ + u64 param; + u64 ingpa; + u64 outgpa; + u16 code; + u16 var_cnt; + u16 rep_cnt; + u16 rep_idx; + bool fast; + bool rep; + sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; + + /* + * Current read offset when KVM reads hypercall input data gradually, + * either offset in bytes from 'ingpa' for regular hypercalls or the + * number of already consumed 'XMM halves' for 'fast' hypercalls. + */ + union { + gpa_t data_offset; + int consumed_xmm_halves; + }; +}; + + +static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, + u16 orig_cnt, u16 cnt_cap, u64 *data) +{ + /* + * Preserve the original count when ignoring entries via a "cap", KVM + * still needs to validate the guest input (though the non-XMM path + * punts on the checks). + */ + u16 cnt = min(orig_cnt, cnt_cap); + int i, j; + + if (hc->fast) { + /* + * Each XMM holds two sparse banks, but do not count halves that + * have already been consumed for hypercall parameters. + */ + if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + for (i = 0; i < cnt; i++) { + j = i + hc->consumed_xmm_halves; + if (j % 2) + data[i] = sse128_hi(hc->xmm[j / 2]); + else + data[i] = sse128_lo(hc->xmm[j / 2]); + } + return 0; + } + + return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data, + cnt * sizeof(*data)); +} + +static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, + u64 *sparse_banks) +{ + if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS) + return -EINVAL; + + /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */ + return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS, + sparse_banks); +} + +static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[]) +{ + return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries); +} + +static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo, + u64 *entries, int count) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY; + + if (!hv_vcpu) + return; + + spin_lock(&tlb_flush_fifo->write_lock); + + /* + * All entries should fit on the fifo leaving one free for 'flush all' + * entry in case another request comes in. In case there's not enough + * space, just put 'flush all' entry there. + */ + if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) { + WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count); + goto out_unlock; + } + + /* + * Note: full fifo always contains 'flush all' entry, no need to check the + * return value. + */ + kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1); + +out_unlock: + spin_unlock(&tlb_flush_fifo->write_lock); +} + +int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE]; + int i, j, count; + gva_t gva; + + if (!tdp_enabled || !hv_vcpu) + return -EINVAL; + + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu)); + + count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE); + + for (i = 0; i < count; i++) { + if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY) + goto out_flush_all; + + if (is_noncanonical_invlpg_address(entries[i], vcpu)) + continue; + + /* + * Lower 12 bits of 'address' encode the number of additional + * pages to flush. + */ + gva = entries[i] & PAGE_MASK; + for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++) + kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); + + ++vcpu->stat.tlb_flush; } - return vcpu_bitmap; + return 0; + +out_flush_all: + kfifo_reset_out(&tlb_flush_fifo->entries); + + /* Fall back to full flush. */ + return -ENOSPC; } -static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, - u16 rep_cnt, bool ex) +static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { - struct kvm *kvm = current_vcpu->kvm; - struct kvm_vcpu_hv *hv_vcpu = ¤t_vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + unsigned long *vcpu_mask = hv_vcpu->vcpu_mask; + u64 *sparse_banks = hv_vcpu->sparse_banks; + struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; - u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; - DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); - unsigned long *vcpu_mask; + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + /* + * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE' + * entries on the TLB flush fifo. The last entry, however, needs to be + * always left free for 'flush all' entry which gets placed when + * there is not enough space to put all the requested entries. + */ + u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1]; + u64 *tlb_flush_entries; u64 valid_bank_mask; - u64 sparse_banks[64]; - int sparse_banks_len; + struct kvm_vcpu *v; + unsigned long i; bool all_cpus; - if (!ex) { - if (unlikely(kvm_read_guest(kvm, ingpa, &flush, sizeof(flush)))) + /* + * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS + * sparse banks. Fail the build if KVM's max allowed number of + * vCPUs (>4096) exceeds this limit. + */ + BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS); + + /* + * 'Slow' hypercall's first parameter is the address in guest's memory + * where hypercall parameters are placed. This is either a GPA or a + * nested GPA when KVM is handling the call from L2 ('direct' TLB + * flush). Translate the address here so the memory can be uniformly + * read with kvm_read_guest(). + */ + if (!hc->fast && is_guest_mode(vcpu)) { + hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); + if (unlikely(hc->ingpa == INVALID_GPA)) return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || + hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) { + if (hc->fast) { + flush.address_space = hc->ingpa; + flush.flags = hc->outgpa; + flush.processor_mask = sse128_lo(hc->xmm[0]); + hc->consumed_xmm_halves = 1; + } else { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, + &flush, sizeof(flush)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + hc->data_offset = sizeof(flush); + } trace_kvm_hv_flush_tlb(flush.processor_mask, - flush.address_space, flush.flags); + flush.address_space, flush.flags, + is_guest_mode(vcpu)); valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; - all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; + + /* + * Work around possible WS2012 bug: it sends hypercalls + * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear, + * while also expecting us to flush something and crashing if + * we don't. Let's treat processor_mask == 0 same as + * HV_FLUSH_ALL_PROCESSORS. + */ + all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || + flush.processor_mask == 0; } else { - if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex, - sizeof(flush_ex)))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->fast) { + flush_ex.address_space = hc->ingpa; + flush_ex.flags = hc->outgpa; + memcpy(&flush_ex.hv_vp_set, + &hc->xmm[0], sizeof(hc->xmm[0])); + hc->consumed_xmm_halves = 2; + } else { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, + sizeof(flush_ex)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + hc->data_offset = sizeof(flush_ex); + } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, flush_ex.hv_vp_set.format, flush_ex.address_space, - flush_ex.flags); + flush_ex.flags, is_guest_mode(vcpu)); valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask; all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - sparse_banks_len = - bitmap_weight((unsigned long *)&valid_bank_mask, 64) * - sizeof(sparse_banks[0]); + if (hc->var_cnt != hweight64(valid_bank_mask)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; - if (!sparse_banks_len && !all_cpus) - goto ret_success; + if (!all_cpus) { + if (!hc->var_cnt) + goto ret_success; - if (!all_cpus && - kvm_read_guest(kvm, - ingpa + offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents), - sparse_banks, - sparse_banks_len)) - return HV_STATUS_INVALID_HYPERCALL_INPUT; - } + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } - cpumask_clear(&hv_vcpu->tlb_flush); + /* + * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU + * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs' + * case (HV_GENERIC_SET_ALL). Always adjust data_offset and + * consumed_xmm_halves to make sure TLB flush entries are read + * from the correct offset. + */ + if (hc->fast) + hc->consumed_xmm_halves += hc->var_cnt; + else + hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]); + } - vcpu_mask = all_cpus ? NULL : - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); + if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || + hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX || + hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) { + tlb_flush_entries = NULL; + } else { + if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + tlb_flush_entries = __tlb_flush_entries; + } /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, - KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, - vcpu_mask, &hv_vcpu->tlb_flush); + if (all_cpus && !is_guest_mode(vcpu)) { + kvm_for_each_vcpu(i, v, kvm) { + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); + } else if (!is_guest_mode(vcpu)) { + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); + + for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { + v = kvm_get_vcpu(kvm, i); + if (!v) + continue; + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); + } else { + struct kvm_vcpu_hv *hv_v; + + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); + + kvm_for_each_vcpu(i, v, kvm) { + hv_v = to_hv_vcpu(v); + + /* + * The following check races with nested vCPUs entering/exiting + * and/or migrating between L1's vCPUs, however the only case when + * KVM *must* flush the TLB is when the target L2 vCPU keeps + * running on the same L1 vCPU from the moment of the request until + * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other + * cases, e.g. when the target L2 vCPU migrates to a different L1 + * vCPU or when the corresponding L1 vCPU temporary switches to a + * different L2 vCPU while the request is being processed. + */ + if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id) + continue; + + if (!all_cpus && + !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask, + sparse_banks)) + continue; + + __set_bit(i, vcpu_mask); + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); + } ret_success: - /* We always do full TLB flush, set rep_done = rep_cnt. */ + /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ return (u64)HV_STATUS_SUCCESS | - ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); + ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } -static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, - unsigned long *vcpu_bitmap) +static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, + u64 *sparse_banks, u64 valid_bank_mask) { struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = vector }; struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) + if (sparse_banks && + !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu), + valid_bank_mask, sparse_banks)) continue; /* We fail only when APIC is disabled */ @@ -1436,43 +2219,48 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, } } -static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa, - bool ex, bool fast) +static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { - struct kvm *kvm = current_vcpu->kvm; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 *sparse_banks = hv_vcpu->sparse_banks; + struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; - u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; - DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); - unsigned long *vcpu_mask; - unsigned long valid_bank_mask; - u64 sparse_banks[64]; - int sparse_banks_len; + u64 valid_bank_mask; u32 vector; bool all_cpus; - if (!ex) { - if (!fast) { - if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi, + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + if (hc->code == HVCALL_SEND_IPI) { + if (!hc->fast) { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, sizeof(send_ipi)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = send_ipi.cpu_mask; vector = send_ipi.vector; } else { /* 'reserved' part of hv_send_ipi should be 0 */ - if (unlikely(ingpa >> 32 != 0)) + if (unlikely(hc->ingpa >> 32 != 0)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - sparse_banks[0] = outgpa; - vector = (u32)ingpa; + sparse_banks[0] = hc->outgpa; + vector = (u32)hc->ingpa; } all_cpus = false; valid_bank_mask = BIT_ULL(0); trace_kvm_hv_send_ipi(vector, sparse_banks[0]); } else { - if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex, - sizeof(send_ipi_ex)))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (!hc->fast) { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, + sizeof(send_ipi_ex)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } else { + send_ipi_ex.vector = (u32)hc->ingpa; + send_ipi_ex.vp_set.format = hc->outgpa; + send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]); + } trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, send_ipi_ex.vp_set.format, @@ -1480,59 +2268,140 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa, vector = send_ipi_ex.vector; valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; - sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * - sizeof(sparse_banks[0]); - all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; - if (!sparse_banks_len) + if (hc->var_cnt != hweight64(valid_bank_mask)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + if (all_cpus) + goto check_and_send_ipi; + + if (!hc->var_cnt) goto ret_success; - if (!all_cpus && - kvm_read_guest(kvm, - ingpa + offsetof(struct hv_send_ipi_ex, - vp_set.bank_contents), - sparse_banks, - sparse_banks_len)) + if (!hc->fast) + hc->data_offset = offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents); + else + hc->consumed_xmm_halves = 1; + + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } +check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - vcpu_mask = all_cpus ? NULL : - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); - - kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + if (all_cpus) + kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0); + else + kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask); ret_success: return HV_STATUS_SUCCESS; } -bool kvm_hv_hypercall_enabled(struct kvm *kvm) +void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled) { - return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct kvm_cpuid_entry2 *entry; + + vcpu->arch.hyperv_enabled = hyperv_enabled; + + if (!hv_vcpu) { + /* + * KVM should have already allocated kvm_vcpu_hv if Hyper-V is + * enabled in CPUID. + */ + WARN_ON_ONCE(vcpu->arch.hyperv_enabled); + return; + } + + memset(&hv_vcpu->cpuid_cache, 0, sizeof(hv_vcpu->cpuid_cache)); + + if (!vcpu->arch.hyperv_enabled) + return; + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); + if (entry) { + hv_vcpu->cpuid_cache.features_eax = entry->eax; + hv_vcpu->cpuid_cache.features_ebx = entry->ebx; + hv_vcpu->cpuid_cache.features_edx = entry->edx; + } + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); + if (entry) { + hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; + hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; + } + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); + if (entry) + hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_NESTED_FEATURES); + if (entry) { + hv_vcpu->cpuid_cache.nested_eax = entry->eax; + hv_vcpu->cpuid_cache.nested_ebx = entry->ebx; + } +} + +int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) +{ + struct kvm_vcpu_hv *hv_vcpu; + int ret = 0; + + if (!to_hv_vcpu(vcpu)) { + if (enforce) { + ret = kvm_hv_vcpu_init(vcpu); + if (ret) + return ret; + } else { + return 0; + } + } + + hv_vcpu = to_hv_vcpu(vcpu); + hv_vcpu->enforce_cpuid = enforce; + + return ret; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; - longmode = is_64_bit_mode(vcpu); + longmode = is_64_bit_hypercall(vcpu); if (longmode) - kvm_register_write(vcpu, VCPU_REGS_RAX, result); + kvm_rax_write(vcpu, result); else { - kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff); + kvm_rdx_write(vcpu, result >> 32); + kvm_rax_write(vcpu, result & 0xffffffff); } } static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { + u32 tlb_lock_count = 0; + int ret; + + if (hv_result_success(result) && is_guest_mode(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu) && + kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa, + &tlb_lock_count, sizeof(tlb_lock_count))) + result = HV_STATUS_INVALID_HYPERCALL_INPUT; + + trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; - return kvm_skip_emulated_instruction(vcpu); + + ret = kvm_skip_emulated_instruction(vcpu); + + if (tlb_lock_count) + kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu); + + return ret; } static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) @@ -1540,19 +2409,21 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } -static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) +static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); struct eventfd_ctx *eventfd; - if (unlikely(!fast)) { + if (unlikely(!hc->fast)) { int ret; - gpa_t gpa = param; + gpa_t gpa = hc->ingpa; - if ((gpa & (__alignof__(param) - 1)) || - offset_in_page(gpa) + sizeof(param) > PAGE_SIZE) + if ((gpa & (__alignof__(hc->ingpa) - 1)) || + offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE) return HV_STATUS_INVALID_ALIGNMENT; - ret = kvm_vcpu_read_guest(vcpu, gpa, ¶m, sizeof(param)); + ret = kvm_vcpu_read_guest(vcpu, gpa, + &hc->ingpa, sizeof(hc->ingpa)); if (ret < 0) return HV_STATUS_INVALID_ALIGNMENT; } @@ -1562,164 +2433,289 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) * have no use for it, and in all known usecases it is zero, so just * report lookup failure if it isn't. */ - if (param & 0xffff00000000ULL) + if (hc->ingpa & 0xffff00000000ULL) return HV_STATUS_INVALID_PORT_ID; /* remaining bits are reserved-zero */ - if (param & ~KVM_HYPERV_CONN_ID_MASK) + if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK) return HV_STATUS_INVALID_HYPERCALL_INPUT; /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ rcu_read_lock(); - eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param); + eventfd = idr_find(&hv->conn_to_evt, hc->ingpa); rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; - eventfd_signal(eventfd, 1); + eventfd_signal(eventfd); return HV_STATUS_SUCCESS; } +static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) +{ + switch (hc->code) { + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: + case HVCALL_SEND_IPI_EX: + return true; + } + + return false; +} + +static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) +{ + int reg; + + kvm_fpu_get(); + for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) + _kvm_read_sse_reg(reg, &hc->xmm[reg]); + kvm_fpu_put(); +} + +static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) +{ + if (!hv_vcpu->enforce_cpuid) + return true; + + switch (code) { + case HVCALL_NOTIFY_LONG_SPIN_WAIT: + return hv_vcpu->cpuid_cache.enlightenments_ebx && + hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX; + case HVCALL_POST_MESSAGE: + return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES; + case HVCALL_SIGNAL_EVENT: + return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS; + case HVCALL_POST_DEBUG_DATA: + case HVCALL_RETRIEVE_DEBUG_DATA: + case HVCALL_RESET_DEBUG_SESSION: + /* + * Return 'true' when SynDBG is disabled so the resulting code + * will be HV_STATUS_INVALID_HYPERCALL_CODE. + */ + return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) || + hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: + if (!(hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return false; + fallthrough; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: + return hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + case HVCALL_SEND_IPI_EX: + if (!(hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return false; + fallthrough; + case HVCALL_SEND_IPI: + return hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_CLUSTER_IPI_RECOMMENDED; + case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + return hv_vcpu->cpuid_cache.features_ebx & + HV_ENABLE_EXTENDED_HYPERCALLS; + default: + break; + } + + return true; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { - u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS; - uint16_t code, rep_idx, rep_cnt; - bool fast, longmode, rep; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct kvm_hv_hcall hc; + u64 ret = HV_STATUS_SUCCESS; /* * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - longmode = is_64_bit_mode(vcpu); +#ifdef CONFIG_X86_64 + if (is_64_bit_hypercall(vcpu)) { + hc.param = kvm_rcx_read(vcpu); + hc.ingpa = kvm_rdx_read(vcpu); + hc.outgpa = kvm_r8_read(vcpu); + } else +#endif + { + hc.param = ((u64)kvm_rdx_read(vcpu) << 32) | + (kvm_rax_read(vcpu) & 0xffffffff); + hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | + (kvm_rcx_read(vcpu) & 0xffffffff); + hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | + (kvm_rsi_read(vcpu) & 0xffffffff); + } + + hc.code = hc.param & 0xffff; + hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET; + hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); + hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; + hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; + hc.rep = !!(hc.rep_cnt || hc.rep_idx); - if (!longmode) { - param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt, + hc.rep_idx, hc.ingpa, hc.outgpa); + + if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { + ret = HV_STATUS_ACCESS_DENIED; + goto hypercall_complete; } -#ifdef CONFIG_X86_64 - else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + + if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + goto hypercall_complete; } -#endif - code = param & 0xffff; - fast = !!(param & HV_HYPERCALL_FAST_BIT); - rep_cnt = (param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; - rep_idx = (param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; - rep = !!(rep_cnt || rep_idx); + if (hc.fast && is_xmm_fast_hypercall(&hc)) { + if (unlikely(hv_vcpu->enforce_cpuid && + !(hv_vcpu->cpuid_cache.features_edx & + HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } - trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + kvm_hv_hypercall_read_xmm(&hc); + } - switch (code) { + switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - if (unlikely(rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: - if (unlikely(rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hvcall_signal_event(vcpu, fast, ingpa); + ret = kvm_hvcall_signal_event(vcpu, &hc); if (ret != HV_STATUS_INVALID_PORT_ID) break; - /* maybe userspace knows this conn_id: fall through */ + fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ - if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) { + if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = param; - vcpu->run->hyperv.u.hcall.params[0] = ingpa; - vcpu->run->hyperv.u.hcall.params[1] = outgpa; - vcpu->arch.complete_userspace_io = - kvm_hv_hypercall_complete_userspace; - return 0; + goto hypercall_userspace_exit; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: - if (unlikely(fast || !rep_cnt || rep_idx)) { + if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false); - break; - case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: - if (unlikely(fast || rep)) { + fallthrough; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: + if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false); + ret = kvm_hv_flush_tlb(vcpu, &hc); break; - case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: - if (unlikely(fast || !rep_cnt || rep_idx)) { + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: + if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); - break; + fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: - if (unlikely(fast || rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); + ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_SEND_IPI: - if (unlikely(rep)) { + if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast); - break; + fallthrough; case HVCALL_SEND_IPI_EX: - if (unlikely(fast || rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false); + ret = kvm_hv_send_ipi(vcpu, &hc); break; + case HVCALL_POST_DEBUG_DATA: + case HVCALL_RETRIEVE_DEBUG_DATA: + if (unlikely(hc.fast)) { + ret = HV_STATUS_INVALID_PARAMETER; + break; + } + fallthrough; + case HVCALL_RESET_DEBUG_SESSION: { + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu)) { + ret = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) { + ret = HV_STATUS_OPERATION_DENIED; + break; + } + goto hypercall_userspace_exit; + } + case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + if (unlikely(hc.fast)) { + ret = HV_STATUS_INVALID_PARAMETER; + break; + } + goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } +hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); + +hypercall_userspace_exit: + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; + vcpu->run->hyperv.u.hcall.input = hc.param; + vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; + vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; + vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; + return 0; } void kvm_hv_init_vm(struct kvm *kvm) { - mutex_init(&kvm->arch.hyperv.hv_lock); - idr_init(&kvm->arch.hyperv.conn_to_evt); + struct kvm_hv *hv = to_kvm_hv(kvm); + + mutex_init(&hv->hv_lock); + idr_init(&hv->conn_to_evt); } void kvm_hv_destroy_vm(struct kvm *kvm) { + struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int i; - idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i) + idr_for_each_entry(&hv->conn_to_evt, eventfd, i) eventfd_ctx_put(eventfd); - idr_destroy(&kvm->arch.hyperv.conn_to_evt); + idr_destroy(&hv->conn_to_evt); } static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int ret; @@ -1729,7 +2725,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) mutex_lock(&hv->hv_lock); ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); mutex_unlock(&hv->hv_lock); if (ret >= 0) @@ -1743,7 +2739,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; mutex_lock(&hv->hv_lock); @@ -1769,10 +2765,10 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); } -int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { - uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu); + uint16_t evmcs_ver = 0; struct kvm_cpuid_entry2 cpuid_entries[] = { { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_INTERFACE }, @@ -1780,13 +2776,15 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, + { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, + { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, + { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); - /* Skip NESTED_FEATURES if eVMCS is not supported */ - if (!evmcs_ver) - --nent; + if (kvm_x86_ops.nested_ops->get_evmcs_version) + evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); if (cpuid->nent < nent) return -E2BIG; @@ -1802,15 +2800,14 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); - ent->eax = HYPERV_CPUID_NESTED_FEATURES; + ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; break; case HYPERV_CPUID_INTERFACE: - memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12); - ent->eax = signature[0]; + ent->eax = HYPERV_CPUID_SIGNATURE_EAX; break; case HYPERV_CPUID_VERSION: @@ -1823,37 +2820,54 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, break; case HYPERV_CPUID_FEATURES: - ent->eax |= HV_X64_MSR_VP_RUNTIME_AVAILABLE; + ent->eax |= HV_MSR_VP_RUNTIME_AVAILABLE; ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; - ent->eax |= HV_X64_MSR_SYNIC_AVAILABLE; + ent->eax |= HV_MSR_SYNIC_AVAILABLE; ent->eax |= HV_MSR_SYNTIMER_AVAILABLE; - ent->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE; - ent->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE; - ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE; - ent->eax |= HV_X64_MSR_RESET_AVAILABLE; + ent->eax |= HV_MSR_APIC_ACCESS_AVAILABLE; + ent->eax |= HV_MSR_HYPERCALL_AVAILABLE; + ent->eax |= HV_MSR_VP_INDEX_AVAILABLE; + ent->eax |= HV_MSR_RESET_AVAILABLE; ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; - ent->eax |= HV_X64_MSR_GUEST_IDLE_AVAILABLE; - ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS; - ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT; + ent->eax |= HV_ACCESS_FREQUENCY_MSRS; + ent->eax |= HV_ACCESS_REENLIGHTENMENT; + ent->eax |= HV_ACCESS_TSC_INVARIANT; - ent->ebx |= HV_X64_POST_MESSAGES; - ent->ebx |= HV_X64_SIGNAL_EVENTS; + ent->ebx |= HV_POST_MESSAGES; + ent->ebx |= HV_SIGNAL_EVENTS; + ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; + ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; - ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + + ent->ebx |= HV_DEBUGGING; + ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; + ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH; + + /* + * Direct Synthetic timers only make sense with in-kernel + * LAPIC + */ + if (!vcpu || lapic_in_kernel(vcpu)) + ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; case HYPERV_CPUID_ENLIGHTMENT_INFO: ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; - ent->eax |= HV_X64_SYSTEM_RESET_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; - ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; + if (evmcs_ver) + ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; + if (!cpu_smt_possible()) + ent->eax |= HV_X64_NO_NONARCH_CORESHARING; + ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. @@ -1875,7 +2889,27 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; + ent->eax |= HV_X64_NESTED_DIRECT_FLUSH; + ent->eax |= HV_X64_NESTED_MSR_BITMAP; + ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; + break; + + case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: + memcpy(signature, "Linux KVM Hv", 12); + + ent->eax = 0; + ent->ebx = signature[0]; + ent->ecx = signature[1]; + ent->edx = signature[2]; + break; + + case HYPERV_CPUID_SYNDBG_INTERFACE: + memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); + ent->eax = signature[0]; + break; + case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: + ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; break; default: |
