diff options
Diffstat (limited to 'arch/x86/kvm/xen.c')
| -rw-r--r-- | arch/x86/kvm/xen.c | 1194 |
1 files changed, 841 insertions, 353 deletions
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 2dae413bd62a..d6b2a665b499 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -5,11 +5,12 @@ * * KVM Xen emulation */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "xen.h" #include "hyperv.h" -#include "lapic.h" +#include "irq.h" #include <linux/eventfd.h> #include <linux/kvm_host.h> @@ -22,6 +23,10 @@ #include <xen/interface/event_channel.h> #include <xen/interface/sched.h> +#include <asm/xen/cpuid.h> +#include <asm/pvclock.h> + +#include "cpuid.h" #include "trace.h" static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm); @@ -30,42 +35,32 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); -static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) +static int kvm_xen_shared_info_init(struct kvm *kvm) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct pvclock_wall_clock *wc; - gpa_t gpa = gfn_to_gpa(gfn); u32 *wc_sec_hi; u32 wc_version; u64 wall_nsec; int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - if (gfn == GPA_INVALID) { - kvm_gpc_deactivate(kvm, gpc); - goto out; - } + read_lock_irq(&gpc->lock); + while (!kvm_gpc_check(gpc, PAGE_SIZE)) { + read_unlock_irq(&gpc->lock); - do { - ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa, - PAGE_SIZE); + ret = kvm_gpc_refresh(gpc, PAGE_SIZE); if (ret) goto out; - /* - * This code mirrors kvm_write_wall_clock() except that it writes - * directly through the pfn cache and doesn't mark the page dirty. - */ - wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); - - /* It could be invalid again already, so we need to check */ read_lock_irq(&gpc->lock); + } - if (gpc->valid) - break; - - read_unlock_irq(&gpc->lock); - } while (1); + /* + * This code mirrors kvm_write_wall_clock() except that it writes + * directly through the pfn cache and doesn't mark the page dirty. + */ + wall_nsec = kvm_get_wall_clock_epoch(kvm); /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -95,7 +90,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) wc_version = wc->version = (wc->version + 1) | 1; smp_wmb(); - wc->nsec = do_div(wall_nsec, 1000000000); + wc->nsec = do_div(wall_nsec, NSEC_PER_SEC); wc->sec = (u32)wall_nsec; *wc_sec_hi = wall_nsec >> 32; smp_wmb(); @@ -131,9 +126,23 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) { struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, arch.xen.timer); + struct kvm_xen_evtchn e; + int rc; + if (atomic_read(&vcpu->arch.xen.timer_pending)) return HRTIMER_NORESTART; + e.vcpu_id = vcpu->vcpu_id; + e.vcpu_idx = vcpu->vcpu_idx; + e.port = vcpu->arch.xen.timer_virq; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm); + if (rc != -EWOULDBLOCK) { + vcpu->arch.xen.timer_expires = 0; + return HRTIMER_NORESTART; + } + atomic_inc(&vcpu->arch.xen.timer_pending); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); @@ -141,141 +150,213 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) +static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu, + struct pvclock_vcpu_time_info *hv_clock, + struct gfn_to_pfn_cache *gpc, + unsigned int offset) { - atomic_set(&vcpu->arch.xen.timer_pending, 0); - vcpu->arch.xen.timer_expires = guest_abs; + unsigned long flags; + int r; - if (delta_ns <= 0) { - xen_timer_callback(&vcpu->arch.xen.timer); - } else { - ktime_t ktime_now = ktime_get(); - hrtimer_start(&vcpu->arch.xen.timer, - ktime_add_ns(ktime_now, delta_ns), - HRTIMER_MODE_ABS_HARD); + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) { + read_unlock_irqrestore(&gpc->lock, flags); + + r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock)); + if (r) + return r; + + read_lock_irqsave(&gpc->lock, flags); } -} -static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) -{ - hrtimer_cancel(&vcpu->arch.xen.timer); - vcpu->arch.xen.timer_expires = 0; - atomic_set(&vcpu->arch.xen.timer_pending, 0); -} + memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock)); + read_unlock_irqrestore(&gpc->lock, flags); -static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) -{ - hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_HARD); - vcpu->arch.xen.timer.function = xen_timer_callback; + /* + * Sanity check TSC shift+multiplier to verify the guest's view of time + * is more or less consistent. + */ + if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift || + hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul) + return -EINVAL; + + return 0; } -static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) +static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, + bool linux_wa) { - struct kvm_vcpu_xen *vx = &v->arch.xen; - u64 now = get_kvmclock_ns(v->kvm); - u64 delta_ns = now - vx->runstate_entry_time; - u64 run_delay = current->sched_info.run_delay; - - if (unlikely(!vx->runstate_entry_time)) - vx->current_runstate = RUNSTATE_offline; + struct kvm_vcpu_xen *xen = &vcpu->arch.xen; + int64_t kernel_now, delta; + uint64_t guest_now; + int r = -EOPNOTSUPP; /* - * Time waiting for the scheduler isn't "stolen" if the - * vCPU wasn't running anyway. + * The guest provides the requested timeout in absolute nanoseconds + * of the KVM clock — as *it* sees it, based on the scaled TSC and + * the pvclock information provided by KVM. + * + * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW + * so use CLOCK_MONOTONIC. In the timescales covered by timers, the + * difference won't matter much as there is no cumulative effect. + * + * Calculate the time for some arbitrary point in time around "now" + * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the + * delta between the kvmclock "now" value and the guest's requested + * timeout, apply the "Linux workaround" described below, and add + * the resulting delta to the CLOCK_MONOTONIC "now" value, to get + * the absolute CLOCK_MONOTONIC time at which the timer should + * fire. */ - if (vx->current_runstate == RUNSTATE_running) { - u64 steal_ns = run_delay - vx->last_steal; + do { + struct pvclock_vcpu_time_info hv_clock; + uint64_t host_tsc, guest_tsc; - delta_ns -= steal_ns; + if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) || + !vcpu->kvm->arch.use_master_clock) + break; - vx->runstate_times[RUNSTATE_runnable] += steal_ns; - } - vx->last_steal = run_delay; + /* + * If both Xen PV clocks are active, arbitrarily try to use the + * compat clock first, but also try to use the non-compat clock + * if the compat clock is unusable. The two PV clocks hold the + * same information, but it's possible one (or both) is stale + * and/or currently unreachable. + */ + if (xen->vcpu_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (r && xen->vcpu_time_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0); + if (r) + break; - vx->runstate_times[vx->current_runstate] += delta_ns; - vx->current_runstate = state; - vx->runstate_entry_time = now; -} + if (!IS_ENABLED(CONFIG_64BIT) || + !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) { + /* + * Don't fall back to get_kvmclock_ns() because it's + * broken; it has a systemic error in its results + * because it scales directly from host TSC to + * nanoseconds, and doesn't scale first to guest TSC + * and *then* to nanoseconds as the guest does. + * + * There is a small error introduced here because time + * continues to elapse between the ktime_get() and the + * subsequent rdtsc(). But not the systemic drift due + * to get_kvmclock_ns(). + */ + kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */ + host_tsc = rdtsc(); + } -void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) -{ - struct kvm_vcpu_xen *vx = &v->arch.xen; - struct gfn_to_pfn_cache *gpc = &vx->runstate_cache; - uint64_t *user_times; - unsigned long flags; - size_t user_len; - int *user_state; + /* Calculate the guest kvmclock as the guest would do it. */ + guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc); + guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc); + } while (0); - kvm_xen_update_runstate(v, state); + if (r) { + /* + * Without CONSTANT_TSC, get_kvmclock_ns() is the only option. + * + * Also if the guest PV clock hasn't been set up yet, as is + * likely to be the case during migration when the vCPU has + * not been run yet. It would be possible to calculate the + * scaling factors properly in that case but there's not much + * point in doing so. The get_kvmclock_ns() drift accumulates + * over time, so it's OK to use it at startup. Besides, on + * migration there's going to be a little bit of skew in the + * precise moment at which timers fire anyway. Often they'll + * be in the "past" by the time the VM is running again after + * migration. + */ + guest_now = get_kvmclock_ns(vcpu->kvm); + kernel_now = ktime_get(); + } - if (!vx->runstate_cache.active) - return; + delta = guest_abs - guest_now; - if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) - user_len = sizeof(struct vcpu_runstate_info); - else - user_len = sizeof(struct compat_vcpu_runstate_info); + /* + * Xen has a 'Linux workaround' in do_set_timer_op() which checks for + * negative absolute timeout values (caused by integer overflow), and + * for values about 13 days in the future (2^50ns) which would be + * caused by jiffies overflow. For those cases, Xen sets the timeout + * 100ms in the future (not *too* soon, since if a guest really did + * set a long timeout on purpose we don't want to keep churning CPU + * time by waking it up). Emulate Xen's workaround when starting the + * timer in response to __HYPERVISOR_set_timer_op. + */ + if (linux_wa && + unlikely((int64_t)guest_abs < 0 || + (delta > 0 && (uint32_t) (delta >> 50) != 0))) { + delta = 100 * NSEC_PER_MSEC; + guest_abs = guest_now + delta; + } - read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - user_len)) { - read_unlock_irqrestore(&gpc->lock, flags); + /* + * Avoid races with the old timer firing. Checking timer_expires + * to avoid calling hrtimer_cancel() will only have false positives + * so is fine. + */ + if (vcpu->arch.xen.timer_expires) + hrtimer_cancel(&vcpu->arch.xen.timer); - /* When invoked from kvm_sched_out() we cannot sleep */ - if (state == RUNSTATE_runnable) - return; + atomic_set(&vcpu->arch.xen.timer_pending, 0); + vcpu->arch.xen.timer_expires = guest_abs; - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len)) - return; + if (delta <= 0) + xen_timer_callback(&vcpu->arch.xen.timer); + else + hrtimer_start(&vcpu->arch.xen.timer, + ktime_add_ns(kernel_now, delta), + HRTIMER_MODE_ABS_HARD); +} - read_lock_irqsave(&gpc->lock, flags); - } +static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) +{ + hrtimer_cancel(&vcpu->arch.xen.timer); + vcpu->arch.xen.timer_expires = 0; + atomic_set(&vcpu->arch.xen.timer_pending, 0); +} + +static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) +{ + struct kvm_vcpu_xen *vx = &v->arch.xen; + struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache; + struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache; + size_t user_len, user_len1, user_len2; + struct vcpu_runstate_info rs; + unsigned long flags; + size_t times_ofs; + uint8_t *update_bit = NULL; + uint64_t entry_time; + uint64_t *rs_times; + int *rs_state; /* * The only difference between 32-bit and 64-bit versions of the - * runstate struct us the alignment of uint64_t in 32-bit, which + * runstate struct is the alignment of uint64_t in 32-bit, which * means that the 64-bit version has an additional 4 bytes of - * padding after the first field 'state'. - * - * So we use 'int __user *user_state' to point to the state field, - * and 'uint64_t __user *user_times' for runstate_entry_time. So - * the actual array of time[] in each state starts at user_times[1]. + * padding after the first field 'state'. Let's be really really + * paranoid about that, and matching it with our internal data + * structures that we memcpy into it... */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); #ifdef CONFIG_X86_64 + /* + * The 64-bit structure has 4 bytes of padding before 'state_entry_time' + * so each subsequent field is shifted by 4, and it's 4 bytes longer. + */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); + BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4); #endif - - user_state = gpc->khva; - - if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) - user_times = gpc->khva + offsetof(struct vcpu_runstate_info, - state_entry_time); - else - user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info, - state_entry_time); - /* - * First write the updated state_entry_time at the appropriate - * location determined by 'offset'. - */ - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != - sizeof(user_times[0])); - BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != - sizeof(user_times[0])); - - user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; - smp_wmb(); - - /* - * Next, write the new runstate. This is in the *same* place - * for 32-bit and 64-bit guests, asserted here for paranoia. + * The state field is in the same place at the start of both structs, + * and is the same size (int) as vx->current_runstate. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != offsetof(struct compat_vcpu_runstate_info, state)); @@ -284,40 +365,259 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - *user_state = vx->current_runstate; + /* + * The state_entry_time field is 64 bits in both versions, and the + * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86 + * is little-endian means that it's in the last *byte* of the word. + * That detail is important later. + */ + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80); /* - * Write the actual runstate times immediately after the - * runstate_entry_time. + * The time array is four 64-bit quantities in both versions, matching + * the vx->runstate_times and immediately following state_entry_time. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != - offsetof(struct vcpu_runstate_info, time) - sizeof(u64)); + offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != - offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64)); + offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof_field(struct compat_vcpu_runstate_info, time)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); - memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); - smp_wmb(); + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { + user_len = sizeof(struct vcpu_runstate_info); + times_ofs = offsetof(struct vcpu_runstate_info, + state_entry_time); + } else { + user_len = sizeof(struct compat_vcpu_runstate_info); + times_ofs = offsetof(struct compat_vcpu_runstate_info, + state_entry_time); + } + + /* + * There are basically no alignment constraints. The guest can set it + * up so it crosses from one page to the next, and at arbitrary byte + * alignment (and the 32-bit ABI doesn't align the 64-bit integers + * anyway, even if the overall struct had been 64-bit aligned). + */ + if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) { + user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK); + user_len2 = user_len - user_len1; + } else { + user_len1 = user_len; + user_len2 = 0; + } + BUG_ON(user_len1 + user_len2 != user_len); + + retry: + /* + * Attempt to obtain the GPC lock on *both* (if there are two) + * gfn_to_pfn caches that cover the region. + */ + if (atomic) { + local_irq_save(flags); + if (!read_trylock(&gpc1->lock)) { + local_irq_restore(flags); + return; + } + } else { + read_lock_irqsave(&gpc1->lock, flags); + } + while (!kvm_gpc_check(gpc1, user_len1)) { + read_unlock_irqrestore(&gpc1->lock, flags); + + /* When invoked from kvm_sched_out() we cannot sleep */ + if (atomic) + return; + + if (kvm_gpc_refresh(gpc1, user_len1)) + return; + + read_lock_irqsave(&gpc1->lock, flags); + } + + if (likely(!user_len2)) { + /* + * Set up three pointers directly to the runstate_info + * struct in the guest (via the GPC). + * + * • @rs_state → state field + * • @rs_times → state_entry_time field. + * • @update_bit → last byte of state_entry_time, which + * contains the XEN_RUNSTATE_UPDATE bit. + */ + rs_state = gpc1->khva; + rs_times = gpc1->khva + times_ofs; + if (v->kvm->arch.xen.runstate_update_flag) + update_bit = ((void *)(&rs_times[1])) - 1; + } else { + /* + * The guest's runstate_info is split across two pages and we + * need to hold and validate both GPCs simultaneously. We can + * declare a lock ordering GPC1 > GPC2 because nothing else + * takes them more than one at a time. Set a subclass on the + * gpc1 lock to make lockdep shut up about it. + */ + lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_); + if (atomic) { + if (!read_trylock(&gpc2->lock)) { + read_unlock_irqrestore(&gpc1->lock, flags); + return; + } + } else { + read_lock(&gpc2->lock); + } + + if (!kvm_gpc_check(gpc2, user_len2)) { + read_unlock(&gpc2->lock); + read_unlock_irqrestore(&gpc1->lock, flags); + + /* When invoked from kvm_sched_out() we cannot sleep */ + if (atomic) + return; + + /* + * Use kvm_gpc_activate() here because if the runstate + * area was configured in 32-bit mode and only extends + * to the second page now because the guest changed to + * 64-bit mode, the second GPC won't have been set up. + */ + if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1, + user_len2)) + return; + + /* + * We dropped the lock on GPC1 so we have to go all the + * way back and revalidate that too. + */ + goto retry; + } + + /* + * In this case, the runstate_info struct will be assembled on + * the kernel stack (compat or not as appropriate) and will + * be copied to GPC1/GPC2 with a dual memcpy. Set up the three + * rs pointers accordingly. + */ + rs_times = &rs.state_entry_time; + + /* + * The rs_state pointer points to the start of what we'll + * copy to the guest, which in the case of a compat guest + * is the 32-bit field that the compiler thinks is padding. + */ + rs_state = ((void *)rs_times) - times_ofs; + + /* + * The update_bit is still directly in the guest memory, + * via one GPC or the other. + */ + if (v->kvm->arch.xen.runstate_update_flag) { + if (user_len1 >= times_ofs + sizeof(uint64_t)) + update_bit = gpc1->khva + times_ofs + + sizeof(uint64_t) - 1; + else + update_bit = gpc2->khva + times_ofs + + sizeof(uint64_t) - 1 - user_len1; + } + +#ifdef CONFIG_X86_64 + /* + * Don't leak kernel memory through the padding in the 64-bit + * version of the struct. + */ + memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time)); +#endif + } + + /* + * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the + * state_entry_time field, directly in the guest. We need to set + * that (and write-barrier) before writing to the rest of the + * structure, and clear it last. Just as Xen does, we address the + * single *byte* in which it resides because it might be in a + * different cache line to the rest of the 64-bit word, due to + * the (lack of) alignment constraints. + */ + entry_time = vx->runstate_entry_time; + if (update_bit) { + entry_time |= XEN_RUNSTATE_UPDATE; + *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; + smp_wmb(); + } /* - * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's - * runstate_entry_time field. + * Now assemble the actual structure, either on our kernel stack + * or directly in the guest according to how the rs_state and + * rs_times pointers were set up above. */ - user_times[0] &= ~XEN_RUNSTATE_UPDATE; + *rs_state = vx->current_runstate; + rs_times[0] = entry_time; + memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); + + /* For the split case, we have to then copy it to the guest. */ + if (user_len2) { + memcpy(gpc1->khva, rs_state, user_len1); + memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2); + } smp_wmb(); - read_unlock_irqrestore(&gpc->lock, flags); + /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */ + if (update_bit) { + entry_time &= ~XEN_RUNSTATE_UPDATE; + *update_bit = entry_time >> 56; + smp_wmb(); + } + + if (user_len2) { + kvm_gpc_mark_dirty_in_slot(gpc2); + read_unlock(&gpc2->lock); + } + + kvm_gpc_mark_dirty_in_slot(gpc1); + read_unlock_irqrestore(&gpc1->lock, flags); +} - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); +void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) +{ + struct kvm_vcpu_xen *vx = &v->arch.xen; + u64 now = get_kvmclock_ns(v->kvm); + u64 delta_ns = now - vx->runstate_entry_time; + u64 run_delay = current->sched_info.run_delay; + + if (unlikely(!vx->runstate_entry_time)) + vx->current_runstate = RUNSTATE_offline; + + /* + * Time waiting for the scheduler isn't "stolen" if the + * vCPU wasn't running anyway. + */ + if (vx->current_runstate == RUNSTATE_running) { + u64 steal_ns = run_delay - vx->last_steal; + + delta_ns -= steal_ns; + + vx->runstate_times[RUNSTATE_runnable] += steal_ns; + } + vx->last_steal = run_delay; + + vx->runstate_times[vx->current_runstate] += delta_ns; + vx->current_runstate = state; + vx->runstate_entry_time = now; + + if (vx->runstate_cache.active) + kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable); } -static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) +void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) { struct kvm_lapic_irq irq = { }; - int r; irq.dest_id = v->vcpu_id; irq.vector = v->arch.xen.upcall_vector; @@ -326,8 +626,7 @@ static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) irq.delivery_mode = APIC_DM_FIXED; irq.level = 1; - /* The fast version will always work for physical unicast */ - WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL)); + kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL); } /* @@ -352,12 +651,10 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) * little more honest about it. */ read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) + if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) return; read_lock_irqsave(&gpc->lock, flags); @@ -388,13 +685,13 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) : "0" (evtchn_pending_sel32)); WRITE_ONCE(vi->evtchn_upcall_pending, 1); } + + kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); /* For the per-vCPU lapic vector, deliver it as MSI. */ if (v->arch.xen.upcall_vector) kvm_xen_inject_vcpu_vector(v); - - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); } int __kvm_xen_has_interrupt(struct kvm_vcpu *v) @@ -417,8 +714,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); /* @@ -432,8 +728,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) if (in_atomic() || !task_is_running(current)) return 1; - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) { /* * If this failed, userspace has screwed up the * vcpu_info mapping. No interrupts for you. @@ -458,26 +753,68 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { r = -EINVAL; } else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.long_mode = !!data->u.long_mode; - mutex_unlock(&kvm->lock); - r = 0; + + /* + * Re-initialize shared_info to put the wallclock in the + * correct place. Whilst it's not necessary to do this + * unless the mode is actually changed, it does no harm + * to make the call anyway. + */ + r = kvm->arch.xen.shinfo_cache.active ? + kvm_xen_shared_info_init(kvm) : 0; + mutex_unlock(&kvm->arch.xen.xen_lock); } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - mutex_lock(&kvm->lock); - r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); - mutex_unlock(&kvm->lock); - break; + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { + int idx; + + mutex_lock(&kvm->arch.xen.xen_lock); + + idx = srcu_read_lock(&kvm->srcu); + + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { + gfn_t gfn = data->u.shared_info.gfn; + if (gfn == KVM_XEN_INVALID_GFN) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, + gfn_to_gpa(gfn), PAGE_SIZE); + } + } else { + void __user * hva = u64_to_user_ptr(data->u.shared_info.hva); + + if (!PAGE_ALIGNED(hva)) { + r = -EINVAL; + } else if (!hva) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache, + (unsigned long)hva, PAGE_SIZE); + } + } + + srcu_read_unlock(&kvm->srcu, idx); + + if (!r && kvm->arch.xen.shinfo_cache.active) + r = kvm_xen_shared_info_init(kvm); + + mutex_unlock(&kvm->arch.xen.xen_lock); + break; + } case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.upcall_vector = data->u.vector; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; @@ -487,9 +824,20 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_XEN_VERSION: - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.xen_version = data->u.xen_version; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); + r = 0; + break; + + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + mutex_lock(&kvm->arch.xen.xen_lock); + kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; @@ -504,7 +852,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: @@ -513,10 +861,18 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_cache.active) + if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache)) data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); else - data->u.shared_info.gfn = GPA_INVALID; + data->u.shared_info.gfn = KVM_XEN_INVALID_GFN; + r = 0; + break; + + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: + if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache)) + data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva; + else + data->u.shared_info.hva = 0; r = 0; break; @@ -530,11 +886,20 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag; + r = 0; + break; + default: break; } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return r; } @@ -542,65 +907,104 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int idx, r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); idx = srcu_read_lock(&vcpu->kvm->srcu); switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA: /* No compat necessary here. */ BUILD_BUG_ON(sizeof(struct vcpu_info) != sizeof(struct compat_vcpu_info)); BUILD_BUG_ON(offsetof(struct vcpu_info, time) != offsetof(struct compat_vcpu_info, time)); - if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); - r = 0; - break; + if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) { + if (data->u.gpa == KVM_XEN_INVALID_GPA) { + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + r = 0; + break; + } + + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, + data->u.gpa, sizeof(struct vcpu_info)); + } else { + if (data->u.hva == 0) { + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + r = 0; + break; + } + + r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache, + data->u.hva, sizeof(struct vcpu_info)); } - r = kvm_gpc_activate(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache, NULL, - KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: - if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + if (data->u.gpa == KVM_XEN_INVALID_GPA) { + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - r = kvm_gpc_activate(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache, + data->u.gpa, sizeof(struct pvclock_vcpu_time_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; - case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: { + size_t sz, sz1, sz2; + if (!sched_info_on()) { r = -EOPNOTSUPP; break; } - if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); + if (data->u.gpa == KVM_XEN_INVALID_GPA) { r = 0; + deactivate_out: + kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); break; } - r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_runstate_info)); - break; + /* + * If the guest switches to 64-bit mode after setting the runstate + * address, that's actually OK. kvm_xen_update_runstate_guest() + * will cope. + */ + if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode) + sz = sizeof(struct vcpu_runstate_info); + else + sz = sizeof(struct compat_vcpu_runstate_info); + + /* How much fits in the (first) page? */ + sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK); + r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache, + data->u.gpa, sz1); + if (r) + goto deactivate_out; + + /* Either map the second page, or deactivate the second GPC */ + if (sz1 >= sz) { + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); + } else { + sz2 = sz - sz1; + BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK); + r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache, + data->u.gpa + sz1, sz2); + if (r) + goto deactivate_out; + } + kvm_xen_update_runstate_guest(vcpu, false); + break; + } case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: if (!sched_info_on()) { r = -EOPNOTSUPP; @@ -693,6 +1097,8 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) if (data->u.runstate.state <= RUNSTATE_offline) kvm_xen_update_runstate(vcpu, data->u.runstate.state); + else if (vcpu->arch.xen.runstate_cache.active) + kvm_xen_update_runstate_guest(vcpu, false); r = 0; break; @@ -712,18 +1118,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } - if (!vcpu->arch.xen.timer.function) - kvm_xen_init_timer(vcpu); - /* Stop the timer (if it's running) before changing the vector */ kvm_xen_stop_timer(vcpu); vcpu->arch.xen.timer_virq = data->u.timer.port; /* Start the timer if the new value has a valid vector+expiry. */ if (data->u.timer.port && data->u.timer.expires_ns) - kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, - data->u.timer.expires_ns - - get_kvmclock_ns(vcpu->kvm)); + kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false); r = 0; break; @@ -742,7 +1143,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) } srcu_read_unlock(&vcpu->kvm->srcu, idx); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -750,14 +1151,22 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: - if (vcpu->arch.xen.vcpu_info_cache.active) + if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache)) data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; else - data->u.gpa = GPA_INVALID; + data->u.gpa = KVM_XEN_INVALID_GPA; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA: + if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache)) + data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva; + else + data->u.hva = 0; r = 0; break; @@ -765,7 +1174,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) if (vcpu->arch.xen.vcpu_time_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; else - data->u.gpa = GPA_INVALID; + data->u.gpa = KVM_XEN_INVALID_GPA; r = 0; break; @@ -818,9 +1227,36 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; case KVM_XEN_VCPU_ATTR_TYPE_TIMER: + /* + * Ensure a consistent snapshot of state is captured, with a + * timer either being pending, or the event channel delivered + * to the corresponding bit in the shared_info. Not still + * lurking in the timer_pending flag for deferred delivery. + * Purely as an optimisation, if the timer_expires field is + * zero, that means the timer isn't active (or even in the + * timer_pending flag) and there is no need to cancel it. + */ + if (vcpu->arch.xen.timer_expires) { + hrtimer_cancel(&vcpu->arch.xen.timer); + kvm_xen_inject_timer_irqs(vcpu); + } + data->u.timer.port = vcpu->arch.xen.timer_virq; data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; data->u.timer.expires_ns = vcpu->arch.xen.timer_expires; + + /* + * The hrtimer may trigger and raise the IRQ immediately, + * while the returned state causes it to be set up and + * raised again on the destination system after migration. + * That's fine, as the guest won't even have had a chance + * to run and handle the interrupt. Asserting an already + * pending event channel is idempotent. + */ + if (vcpu->arch.xen.timer_expires) + hrtimer_start_expires(&vcpu->arch.xen.timer, + HRTIMER_MODE_ABS_HARD); + r = 0; break; @@ -833,7 +1269,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -843,9 +1279,24 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; bool lm = is_long_mode(vcpu); + int r = 0; - /* Latch long_mode for shared_info pages etc. */ - vcpu->kvm->arch.xen.long_mode = lm; + mutex_lock(&kvm->arch.xen.xen_lock); + if (kvm->arch.xen.long_mode != lm) { + kvm->arch.xen.long_mode = lm; + + /* + * Re-initialize shared_info to put the wallclock in the + * correct place. + */ + if (kvm->arch.xen.shinfo_cache.active && + kvm_xen_shared_info_init(kvm)) + r = 1; + } + mutex_unlock(&kvm->arch.xen.xen_lock); + + if (r) + return r; /* * If Xen hypercall intercept is enabled, fill the hypercall @@ -864,7 +1315,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) instructions[0] = 0xb8; /* vmcall / vmmcall */ - static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5); + kvm_x86_call(patch_hypercall)(vcpu, instructions + 5); /* ret */ instructions[8] = 0xc3; @@ -884,11 +1335,12 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) * Note, truncation is a non-issue as 'lm' is guaranteed to be * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes. */ - hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64 - : kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; + hva_t blob_addr = lm ? kvm->arch.xen.hvm_config.blob_addr_64 + : kvm->arch.xen.hvm_config.blob_addr_32; + u8 blob_size = lm ? kvm->arch.xen.hvm_config.blob_size_64 + : kvm->arch.xen.hvm_config.blob_size_32; u8 *page; + int ret; if (page_num >= blob_size) return 1; @@ -899,10 +1351,10 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) if (IS_ERR(page)) return PTR_ERR(page); - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { - kfree(page); + ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE); + kfree(page); + if (ret) return 1; - } } return 0; } @@ -911,7 +1363,9 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) { /* Only some feature flags need to be *enabled* by userspace */ u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | - KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + KVM_XEN_HVM_CONFIG_EVTCHN_SEND | + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; + u32 old_flags; if (xhc->flags & ~permitted_flags) return -EINVAL; @@ -925,16 +1379,30 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) xhc->blob_size_32 || xhc->blob_size_64)) return -EINVAL; - mutex_lock(&kvm->lock); + /* + * Restrict the MSR to the range that is unofficially reserved for + * synthetic, virtualization-defined MSRs, e.g. to prevent confusing + * KVM by colliding with a real MSR that requires special handling. + */ + if (xhc->msr && + (xhc->msr < KVM_XEN_MSR_MIN_INDEX || xhc->msr > KVM_XEN_MSR_MAX_INDEX)) + return -EINVAL; - if (xhc->msr && !kvm->arch.xen_hvm_config.msr) + mutex_lock(&kvm->arch.xen.xen_lock); + + if (xhc->msr && !kvm->arch.xen.hvm_config.msr) static_branch_inc(&kvm_xen_enabled.key); - else if (!xhc->msr && kvm->arch.xen_hvm_config.msr) + else if (!xhc->msr && kvm->arch.xen.hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); - memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); + old_flags = kvm->arch.xen.hvm_config.flags; + memcpy(&kvm->arch.xen.hvm_config, xhc, sizeof(*xhc)); + + mutex_unlock(&kvm->arch.xen.xen_lock); + + if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); - mutex_unlock(&kvm->lock); return 0; } @@ -954,6 +1422,14 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu) return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result); } +static inline int max_evtchn_port(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) + return EVTCHN_2L_NR_CHANNELS; + else + return COMPAT_EVTCHN_2L_NR_CHANNELS; +} + static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, evtchn_port_t *ports) { @@ -964,9 +1440,9 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, bool ret = true; int idx, i; - read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + read_lock_irqsave(&gpc->lock, flags); + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; ret = false; @@ -986,8 +1462,8 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, } out_rcu: - srcu_read_unlock(&kvm->srcu, idx); read_unlock_irqrestore(&gpc->lock, flags); + srcu_read_unlock(&kvm->srcu, idx); return ret; } @@ -995,23 +1471,40 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, u64 param, u64 *r) { - int idx, i; struct sched_poll sched_poll; evtchn_port_t port, *ports; - gpa_t gpa; + struct x86_exception e; + int i; - if (!longmode || !lapic_in_kernel(vcpu) || - !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) + if (!lapic_in_kernel(vcpu) || + !(vcpu->kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) return false; - idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + if (IS_ENABLED(CONFIG_64BIT) && !longmode) { + struct compat_sched_poll sp32; - if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, - sizeof(sched_poll))) { - *r = -EFAULT; - return true; + /* Sanity check that the compat struct definition is correct */ + BUILD_BUG_ON(sizeof(sp32) != 16); + + if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) { + *r = -EFAULT; + return true; + } + + /* + * This is a 32-bit pointer to an array of evtchn_port_t which + * are uint32_t, so once it's converted no further compat + * handling is needed. + */ + sched_poll.ports = (void *)(unsigned long)(sp32.ports); + sched_poll.nr_ports = sp32.nr_ports; + sched_poll.timeout = sp32.timeout; + } else { + if (kvm_read_guest_virt(vcpu, param, &sched_poll, + sizeof(sched_poll), &e)) { + *r = -EFAULT; + return true; + } } if (unlikely(sched_poll.nr_ports > 1)) { @@ -1030,16 +1523,15 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, } else ports = &port; + if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports, + sched_poll.nr_ports * sizeof(*ports), &e)) { + *r = -EFAULT; + goto out; + } + for (i = 0; i < sched_poll.nr_ports; i++) { - idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, - (gva_t)(sched_poll.ports + i), - NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, - &ports[i], sizeof(port))) { - *r = -EFAULT; + if (ports[i] >= max_evtchn_port(vcpu->kvm)) { + *r = -EINVAL; goto out; } } @@ -1052,7 +1544,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); if (sched_poll.timeout) mod_timer(&vcpu->arch.xen.poll_timer, @@ -1061,9 +1553,9 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, kvm_vcpu_halt(vcpu); if (sched_poll.timeout) - del_timer(&vcpu->arch.xen.poll_timer); + timer_delete(&vcpu->arch.xen.poll_timer); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } vcpu->arch.xen.poll_evtchn = 0; @@ -1079,7 +1571,8 @@ out: static void cancel_evtchn_poll(struct timer_list *t) { - struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer); + struct kvm_vcpu *vcpu = timer_container_of(vcpu, t, + arch.xen.poll_timer); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); @@ -1113,9 +1606,7 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, int vcpu_id, u64 param, u64 *r) { struct vcpu_set_singleshot_timer oneshot; - s64 delta; - gpa_t gpa; - int idx; + struct x86_exception e; if (!kvm_xen_timer_enabled(vcpu)) return false; @@ -1126,9 +1617,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, *r = -EINVAL; return true; } - idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); /* * The only difference for 32-bit compat is the 4 bytes of @@ -1146,20 +1634,13 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) != sizeof_field(struct vcpu_set_singleshot_timer, flags)); - if (!gpa || - kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) : - sizeof(struct compat_vcpu_set_singleshot_timer))) { + if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) : + sizeof(struct compat_vcpu_set_singleshot_timer), &e)) { *r = -EFAULT; return true; } - delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); - if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) { - *r = -ETIME; - return true; - } - - kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); + kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false); *r = 0; return true; @@ -1182,29 +1663,10 @@ static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout, if (!kvm_xen_timer_enabled(vcpu)) return false; - if (timeout) { - uint64_t guest_now = get_kvmclock_ns(vcpu->kvm); - int64_t delta = timeout - guest_now; - - /* Xen has a 'Linux workaround' in do_set_timer_op() which - * checks for negative absolute timeout values (caused by - * integer overflow), and for values about 13 days in the - * future (2^50ns) which would be caused by jiffies - * overflow. For those cases, it sets the timeout 100ms in - * the future (not *too* soon, since if a guest really did - * set a long timeout on purpose we don't want to keep - * churning CPU time by waking it up). - */ - if (unlikely((int64_t)timeout < 0 || - (delta > 0 && (uint32_t) (delta >> 50) != 0))) { - delta = 100 * NSEC_PER_MSEC; - timeout = guest_now + delta; - } - - kvm_xen_start_timer(vcpu, timeout, delta); - } else { + if (timeout) + kvm_xen_start_timer(vcpu, timeout, true); + else kvm_xen_stop_timer(vcpu); - } *r = 0; return true; @@ -1215,6 +1677,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) bool longmode; u64 input, params[6], r = -ENOSYS; bool handled = false; + u8 cpl; input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -1242,9 +1705,17 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) params[5] = (u64)kvm_r9_read(vcpu); } #endif - trace_kvm_xen_hypercall(input, params[0], params[1], params[2], + cpl = kvm_x86_call(get_cpl)(vcpu); + trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2], params[3], params[4], params[5]); + /* + * Only allow hypercall acceleration for CPL0. The rare hypercalls that + * are permitted in guest userspace can be handled by the VMM. + */ + if (unlikely(cpl > 0)) + goto handle_in_userspace; + switch (input) { case __HYPERVISOR_xen_version: if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) { @@ -1279,10 +1750,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) if (handled) return kvm_xen_hypercall_set_result(vcpu, r); +handle_in_userspace: vcpu->run->exit_reason = KVM_EXIT_XEN; vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; vcpu->run->xen.u.hcall.longmode = longmode; - vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu); + vcpu->run->xen.u.hcall.cpl = cpl; vcpu->run->xen.u.hcall.input = input; vcpu->run->xen.u.hcall.params[0] = params[0]; vcpu->run->xen.u.hcall.params[1] = params[1]; @@ -1297,14 +1769,6 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) return 0; } -static inline int max_evtchn_port(struct kvm *kvm) -{ - if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) - return EVTCHN_2L_NR_CHANNELS; - else - return COMPAT_EVTCHN_2L_NR_CHANNELS; -} - static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) { int poll_evtchn = vcpu->arch.xen.poll_evtchn; @@ -1346,9 +1810,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx); } - if (!vcpu->arch.xen.vcpu_info_cache.active) - return -EINVAL; - if (xe->port >= max_evtchn_port(kvm)) return -EINVAL; @@ -1357,7 +1818,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) idx = srcu_read_lock(&kvm->srcu); read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { @@ -1391,7 +1852,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) gpc = &vcpu->arch.xen.vcpu_info_cache; read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) { + if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { /* * Could not access the vcpu_info. Set the bit in-kernel * and prod the vCPU to deliver it for itself. @@ -1457,16 +1918,6 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) } /* - * For the irqfd workqueue, using the main kvm->lock mutex is - * fine since this function is invoked from kvm_set_irq() with - * no other lock held, no srcu. In future if it will be called - * directly from a vCPU thread (e.g. on hypercall for an IPI) - * then it may need to switch to using a leaf-node mutex for - * serializing the shared_info mapping. - */ - mutex_lock(&kvm->lock); - - /* * It is theoretically possible for the page to be unmapped * and the MMU notifier to invalidate the shared_info before * we even get to use it. In that case, this looks like an @@ -1489,12 +1940,10 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) break; idx = srcu_read_lock(&kvm->srcu); - rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE); + rc = kvm_gpc_refresh(gpc, PAGE_SIZE); srcu_read_unlock(&kvm->srcu, idx); } while(!rc); - mutex_unlock(&kvm->lock); - if (mm_borrowed) kthread_unuse_mm(kvm->mm); @@ -1522,8 +1971,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, { struct kvm_vcpu *vcpu; - if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) - return -EINVAL; + /* + * Don't check for the port being within range of max_evtchn_port(). + * Userspace can configure what ever targets it likes; events just won't + * be delivered if/while the target is invalid, just like userspace can + * configure MSIs which target non-existent APICs. + * + * This allow on Live Migration and Live Update, the IRQ routing table + * can be restored *independently* of other things like creating vCPUs, + * without imposing an ordering dependency on userspace. In this + * particular case, the problematic ordering would be with setting the + * Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096 + * instead of 1024 event channels. + */ /* We only support 2 level event channels for now */ if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) @@ -1606,20 +2066,20 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, { u32 port = data->u.evtchn.send_port; struct evtchnfd *evtchnfd; + int ret; - if (!port || port >= max_evtchn_port(kvm)) - return -EINVAL; - - mutex_lock(&kvm->lock); + /* Protect writes to evtchnfd as well as the idr lookup. */ + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); - mutex_unlock(&kvm->lock); + ret = -ENOENT; if (!evtchnfd) - return -ENOENT; + goto out_unlock; /* For an UPDATE, nothing may change except the priority/vcpu */ + ret = -EINVAL; if (evtchnfd->type != data->u.evtchn.type) - return -EINVAL; + goto out_unlock; /* * Port cannot change, and if it's zero that was an eventfd @@ -1627,20 +2087,21 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, */ if (!evtchnfd->deliver.port.port || evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port) - return -EINVAL; + goto out_unlock; /* We only support 2 level event channels for now */ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) - return -EINVAL; + goto out_unlock; - mutex_lock(&kvm->lock); evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) { evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; evtchnfd->deliver.port.vcpu_idx = -1; } - mutex_unlock(&kvm->lock); - return 0; + ret = 0; +out_unlock: + mutex_unlock(&kvm->arch.xen.xen_lock); + return ret; } /* @@ -1652,12 +2113,9 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, { u32 port = data->u.evtchn.send_port; struct eventfd_ctx *eventfd = NULL; - struct evtchnfd *evtchnfd = NULL; + struct evtchnfd *evtchnfd; int ret = -EINVAL; - if (!port || port >= max_evtchn_port(kvm)) - return -EINVAL; - evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL); if (!evtchnfd) return -ENOMEM; @@ -1705,10 +2163,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, GFP_KERNEL); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (ret >= 0) return 0; @@ -1726,15 +2184,14 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) { struct evtchnfd *evtchnfd; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (!evtchnfd) return -ENOENT; - if (kvm) - synchronize_srcu(&kvm->srcu); + synchronize_srcu(&kvm->srcu); if (!evtchnfd->deliver.port.port) eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); kfree(evtchnfd); @@ -1743,18 +2200,42 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) static int kvm_xen_eventfd_reset(struct kvm *kvm) { - struct evtchnfd *evtchnfd; + struct evtchnfd *evtchnfd, **all_evtchnfds; int i; + int n = 0; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); + + /* + * Because synchronize_srcu() cannot be called inside the + * critical section, first collect all the evtchnfd objects + * in an array as they are removed from evtchn_ports. + */ + idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) + n++; + + all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL); + if (!all_evtchnfds) { + mutex_unlock(&kvm->arch.xen.xen_lock); + return -ENOMEM; + } + + n = 0; idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { + all_evtchnfds[n++] = evtchnfd; idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); - synchronize_srcu(&kvm->srcu); + } + mutex_unlock(&kvm->arch.xen.xen_lock); + + synchronize_srcu(&kvm->srcu); + + while (n--) { + evtchnfd = all_evtchnfds[n]; if (!evtchnfd->deliver.port.port) eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); kfree(evtchnfd); } - mutex_unlock(&kvm->lock); + kfree(all_evtchnfds); return 0; } @@ -1783,20 +2264,22 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) { struct evtchnfd *evtchnfd; struct evtchn_send send; - gpa_t gpa; - int idx; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + struct x86_exception e; - if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) { + /* Sanity check: this structure is the same for 32-bit and 64-bit */ + BUILD_BUG_ON(sizeof(send) != 4); + if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) { *r = -EFAULT; return true; } - /* The evtchn_ports idr is protected by vcpu->kvm->srcu */ + /* + * evtchnfd is protected by kvm->srcu; the idr lookup instead + * is protected by RCU. + */ + rcu_read_lock(); evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port); + rcu_read_unlock(); if (!evtchnfd) return false; @@ -1805,7 +2288,7 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) if (ret < 0 && ret != -ENOTCONN) return false; } else { - eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1); + eventfd_signal(evtchnfd->deliver.eventfd.ctx); } *r = 0; @@ -1818,10 +2301,13 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xen.poll_evtchn = 0; timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); + hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); - kvm_gpc_init(&vcpu->arch.xen.runstate_cache); - kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache); - kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); + kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); + kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm); + kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) @@ -1829,17 +2315,19 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_stop_timer(vcpu); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); - del_timer_sync(&vcpu->arch.xen.poll_timer); + timer_delete_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_init_vm(struct kvm *kvm) { + mutex_init(&kvm->arch.xen.xen_lock); idr_init(&kvm->arch.xen.evtchn_ports); - kvm_gpc_init(&kvm->arch.xen.shinfo_cache); + kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm); } void kvm_xen_destroy_vm(struct kvm *kvm) @@ -1847,7 +2335,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm) struct evtchnfd *evtchnfd; int i; - kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache); + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { if (!evtchnfd->deliver.port.port) @@ -1856,6 +2344,6 @@ void kvm_xen_destroy_vm(struct kvm *kvm) } idr_destroy(&kvm->arch.xen.evtchn_ports); - if (kvm->arch.xen_hvm_config.msr) + if (kvm->arch.xen.hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); } |
