From 1d487e9bf8ba66a7174c56a0029c54b1eca8f99c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Apr 2019 11:16:47 +0200 Subject: KVM: fix spectrev1 gadgets These were found with smatch, and then generalized when applicable. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/lapic.c') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 991fdf7fc17f..9bf70cf84564 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -138,6 +138,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, if (offset <= max_apic_id) { u8 cluster_size = min(max_apic_id - offset + 1, 16U); + offset = array_index_nospec(offset, map->max_apic_id + 1); *cluster = &map->phys_map[offset]; *mask = dest_id & (0xffff >> (16 - cluster_size)); } else { @@ -901,7 +902,8 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, if (irq->dest_id > map->max_apic_id) { *bitmap = 0; } else { - *dst = &map->phys_map[irq->dest_id]; + u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1); + *dst = &map->phys_map[dest_id]; *bitmap = 1; } return true; -- cgit From c09d65d9eab69985c75f98ed64541229f6fa9aa6 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Tue, 16 Apr 2019 20:36:34 +0300 Subject: KVM: x86: Consider LAPIC TSC-Deadline timer expired if deadline too short If guest sets MSR_IA32_TSCDEADLINE to value such that in host time-domain it's shorter than lapic_timer_advance_ns, we can reach a case that we call hrtimer_start() with expiration time set at the past. Because lapic_timer.timer is init with HRTIMER_MODE_ABS_PINNED, it is not allowed to run in softirq and therefore will never expire. To avoid such a scenario, verify that deadline expiration time is set on host time-domain further than (now + lapic_timer_advance_ns). A future patch can also consider adding a min_timer_deadline_ns module parameter, similar to min_timer_period_us to avoid races that amount of ns it takes to run logic could still call hrtimer_start() with expiration timer set at the past. Reviewed-by: Joao Martins Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/lapic.c') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9bf70cf84564..6cb990dbc15a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1542,9 +1542,12 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); + + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + + if (likely(tscdeadline > guest_tsc) && + likely(ns > lapic_timer_advance_ns)) { expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, -- cgit From 57bf67e73ce9bcce2258890f5abf2adf5f619f1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Apr 2019 10:15:31 -0700 Subject: KVM: lapic: Disable timer advancement if adaptive tuning goes haywire To minimize the latency of timer interrupts as observed by the guest, KVM adjusts the values it programs into the host timers to account for the host's overhead of programming and handling the timer event. Now that the timer advancement is automatically tuned during runtime, it's effectively unbounded by default, e.g. if KVM is running as L1 the advancement can measure in hundreds of milliseconds. Disable timer advancement if adaptive tuning yields an advancement of more than 5000ns, as large advancements can break reasonable assumptions of the guest, e.g. that a timer configured to fire after 1ms won't arrive on the next instruction. Although KVM busy waits to mitigate the case of a timer event arriving too early, complications can arise when shifting the interrupt too far, e.g. kvm-unit-test's vmx.interrupt test will fail when its "host" exits on interrupts as KVM may inject the INTR before the guest executes STI+HLT. Arguably the unit test is "broken" in the sense that delaying a timer interrupt by 1ms doesn't technically guarantee the interrupt will arrive after STI+HLT, but it's a reasonable assumption that KVM should support. Furthermore, an unbounded advancement also effectively unbounds the time spent busy waiting, e.g. if the guest programs a timer with a very large delay. 5000ns is a somewhat arbitrary threshold. When running on bare metal, which is the intended use case, timer advancement is expected to be in the general vicinity of 1000ns. 5000ns is high enough that false positives are unlikely, while not being so high as to negatively affect the host's performance/stability. Note, a future patch will enable userspace to disable KVM's adaptive tuning, which will allow priveleged userspace will to specifying an advancement value in excess of this arbitrary threshold in order to satisfy an abnormal use case. Cc: Liran Alon Cc: Wanpeng Li Cc: stable@vger.kernel.org Fixes: 3b8a5df6c4dc6 ("KVM: LAPIC: Tune lapic_timer_advance_ns automatically") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kvm/lapic.c') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6cb990dbc15a..bbe1402309e3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1522,6 +1522,10 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) } if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) lapic_timer_advance_adjust_done = true; + if (unlikely(lapic_timer_advance_ns > 5000)) { + lapic_timer_advance_ns = 0; + lapic_timer_advance_adjust_done = true; + } } } -- cgit From 39497d7660d9866a47a2dc9055672358da57ad3d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Apr 2019 10:15:32 -0700 Subject: KVM: lapic: Track lapic timer advance per vCPU Automatically adjusting the globally-shared timer advancement could corrupt the timer, e.g. if multiple vCPUs are concurrently adjusting the advancement value. That could be partially fixed by using a local variable for the arithmetic, but it would still be susceptible to a race when setting timer_advance_adjust_done. And because virtual_tsc_khz and tsc_scaling_ratio are per-vCPU, the correct calibration for a given vCPU may not apply to all vCPUs. Furthermore, lapic_timer_advance_ns is marked __read_mostly, which is effectively violated when finding a stable advancement takes an extended amount of timer. Opportunistically change the definition of lapic_timer_advance_ns to a u32 so that it matches the style of struct kvm_timer. Explicitly pass the param to kvm_create_lapic() so that it doesn't have to be exposed to lapic.c, thus reducing the probability of unintentionally using the global value instead of the per-vCPU value. Cc: Liran Alon Cc: Wanpeng Li Reviewed-by: Liran Alon Cc: stable@vger.kernel.org Fixes: 3b8a5df6c4dc6 ("KVM: LAPIC: Tune lapic_timer_advance_ns automatically") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm/lapic.c') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bbe1402309e3..7decd58c9cea 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -70,7 +70,6 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -static bool lapic_timer_advance_adjust_done = false; #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -1485,6 +1484,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 guest_tsc, tsc_deadline, ns; if (!lapic_in_kernel(vcpu)) @@ -1504,34 +1504,36 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ if (guest_tsc < tsc_deadline) __delay(min(tsc_deadline - guest_tsc, - nsec_to_cycles(vcpu, lapic_timer_advance_ns))); + nsec_to_cycles(vcpu, timer_advance_ns))); - if (!lapic_timer_advance_adjust_done) { + if (!apic->lapic_timer.timer_advance_adjust_done) { /* too early */ if (guest_tsc < tsc_deadline) { ns = (tsc_deadline - guest_tsc) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns -= min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } else { /* too late */ ns = (guest_tsc - tsc_deadline) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns += min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - lapic_timer_advance_adjust_done = true; - if (unlikely(lapic_timer_advance_ns > 5000)) { - lapic_timer_advance_ns = 0; - lapic_timer_advance_adjust_done = true; + apic->lapic_timer.timer_advance_adjust_done = true; + if (unlikely(timer_advance_ns > 5000)) { + timer_advance_ns = 0; + apic->lapic_timer.timer_advance_adjust_done = true; } + apic->lapic_timer.timer_advance_ns = timer_advance_ns; } } static void start_sw_tscdeadline(struct kvm_lapic *apic) { - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + u64 guest_tsc, tscdeadline = ktimer->tscdeadline; u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; @@ -1551,11 +1553,10 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) do_div(ns, this_tsc_khz); if (likely(tscdeadline > guest_tsc) && - likely(ns > lapic_timer_advance_ns)) { + likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); + expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED); } else apic_timer_expired(apic); @@ -2262,7 +2263,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu) +int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns) { struct kvm_lapic *apic; @@ -2286,6 +2287,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; + apic->lapic_timer.timer_advance_ns = timer_advance_ns; /* * APIC is created enabled. This will prevent kvm_lapic_set_base from -- cgit From c3941d9e0ccd48920e4811f133235b3597e5310b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Apr 2019 10:15:33 -0700 Subject: KVM: lapic: Allow user to disable adaptive tuning of timer advancement The introduction of adaptive tuning of lapic timer advancement did not allow for the scenario where userspace would want to disable adaptive tuning but still employ timer advancement, e.g. for testing purposes or to handle a use case where adaptive tuning is unable to settle on a suitable time. This is epecially pertinent now that KVM places a hard threshold on the maximum advancment time. Rework the timer semantics to accept signed values, with a value of '-1' being interpreted as "use adaptive tuning with KVM's internal default", and any other value being used as an explicit advancement time, e.g. a time of '0' effectively disables advancement. Note, this does not completely restore the original behavior of lapic_timer_advance_ns. Prior to tracking the advancement per vCPU, which is necessary to support autotuning, userspace could adjust lapic_timer_advance_ns for *running* vCPU. With per-vCPU tracking, the module params are snapshotted at vCPU creation, i.e. applying a new advancement effectively requires restarting a VM. Dynamically updating a running vCPU is possible, e.g. a helper could be added to retrieve the desired delay, choosing between the global module param and the per-VCPU value depending on whether or not auto-tuning is (globally) enabled, but introduces a great deal of complexity. The wrapper itself is not complex, but understanding and documenting the effects of dynamically toggling auto-tuning and/or adjusting the timer advancement is nigh impossible since the behavior would be dependent on KVM's implementation as well as compiler optimizations. In other words, providing stable behavior would require extremely careful consideration now and in the future. Given that the expected use of a manually-tuned timer advancement is to "tune once, run many", use the vastly simpler approach of recognizing changes to the module params only when creating a new vCPU. Cc: Liran Alon Cc: Wanpeng Li Reviewed-by: Liran Alon Cc: stable@vger.kernel.org Fixes: 3b8a5df6c4dc6 ("KVM: LAPIC: Tune lapic_timer_advance_ns automatically") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/lapic.c') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7decd58c9cea..12f2f7dc121f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2263,7 +2263,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns) +int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) { struct kvm_lapic *apic; @@ -2287,7 +2287,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; - apic->lapic_timer.timer_advance_ns = timer_advance_ns; + if (timer_advance_ns == -1) { + apic->lapic_timer.timer_advance_ns = 1000; + apic->lapic_timer.timer_advance_adjust_done = false; + } else { + apic->lapic_timer.timer_advance_ns = timer_advance_ns; + apic->lapic_timer.timer_advance_adjust_done = true; + } + /* * APIC is created enabled. This will prevent kvm_lapic_set_base from -- cgit From b6aa57c69cb26ea0160c51f7cf45f1af23542686 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Apr 2019 10:15:34 -0700 Subject: KVM: lapic: Convert guest TSC to host time domain if necessary To minimize the latency of timer interrupts as observed by the guest, KVM adjusts the values it programs into the host timers to account for the host's overhead of programming and handling the timer event. In the event that the adjustments are too aggressive, i.e. the timer fires earlier than the guest expects, KVM busy waits immediately prior to entering the guest. Currently, KVM manually converts the delay from nanoseconds to clock cycles. But, the conversion is done in the guest's time domain, while the delay occurs in the host's time domain. This is perfectly ok when the guest and host are using the same TSC ratio, but if the guest is using a different ratio then the delay may not be accurate and could wait too little or too long. When the guest is not using the host's ratio, convert the delay from guest clock cycles to host nanoseconds and use ndelay() instead of __delay() to provide more accurate timing. Because converting to nanoseconds is relatively expensive, e.g. requires division and more multiplication ops, continue using __delay() directly when guest and host TSCs are running at the same ratio. Cc: Liran Alon Cc: Wanpeng Li Cc: stable@vger.kernel.org Fixes: 3b8a5df6c4dc6 ("KVM: LAPIC: Tune lapic_timer_advance_ns automatically") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/lapic.c') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 12f2f7dc121f..e0fa6fc0b2d8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1481,6 +1481,26 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) return false; } +static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) +{ + u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns; + + /* + * If the guest TSC is running at a different ratio than the host, then + * convert the delay to nanoseconds to achieve an accurate delay. Note + * that __delay() uses delay_tsc whenever the hardware has TSC, thus + * always for VMX enabled hardware. + */ + if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + __delay(min(guest_cycles, + nsec_to_cycles(vcpu, timer_advance_ns))); + } else { + u64 delay_ns = guest_cycles * 1000000ULL; + do_div(delay_ns, vcpu->arch.virtual_tsc_khz); + ndelay(min_t(u32, delay_ns, timer_advance_ns)); + } +} + void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1501,10 +1521,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ if (guest_tsc < tsc_deadline) - __delay(min(tsc_deadline - guest_tsc, - nsec_to_cycles(vcpu, timer_advance_ns))); + __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); if (!apic->lapic_timer.timer_advance_adjust_done) { /* too early */ -- cgit From b904cb8dff824b79233e82c078837627ebd52717 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Apr 2019 19:01:09 -0700 Subject: KVM: lapic: Check for in-kernel LAPIC before deferencing apic pointer ...to avoid dereferencing a null pointer when querying the per-vCPU timer advance. Fixes: 39497d7660d98 ("KVM: lapic: Track lapic timer advance per vCPU") Reported-by: syzbot+f7e65445a40d3e0e4ebf@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kvm/lapic.c') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e0fa6fc0b2d8..bd13fdddbdc4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1507,9 +1507,6 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 guest_tsc, tsc_deadline, ns; - if (!lapic_in_kernel(vcpu)) - return; - if (apic->lapic_timer.expired_tscdeadline == 0) return; -- cgit