diff options
Diffstat (limited to 'virt/kvm')
40 files changed, 6387 insertions, 19982 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index ea434ddc8499..267c7369c765 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -1,21 +1,43 @@ # SPDX-License-Identifier: GPL-2.0 # KVM common configuration items and defaults -config HAVE_KVM +config KVM_COMMON bool + select EVENTFD + select INTERVAL_TREE + select PREEMPT_NOTIFIERS -config HAVE_KVM_IRQCHIP +config HAVE_KVM_PFNCACHE bool -config HAVE_KVM_IRQFD +config HAVE_KVM_IRQCHIP bool config HAVE_KVM_IRQ_ROUTING bool -config HAVE_KVM_EVENTFD +config HAVE_KVM_DIRTY_RING bool - select EVENTFD + +# Only strongly ordered architectures can select this, as it doesn't +# put any explicit constraint on userspace ordering. They can also +# select the _ACQ_REL version. +config HAVE_KVM_DIRTY_RING_TSO + bool + select HAVE_KVM_DIRTY_RING + depends on X86 + +# Weakly ordered architectures can only select this, advertising +# to userspace the additional ordering requirements. +config HAVE_KVM_DIRTY_RING_ACQ_REL + bool + select HAVE_KVM_DIRTY_RING + +# Allow enabling both the dirty bitmap and dirty ring. Only architectures +# that need to dirty memory outside of a vCPU context should select this. 
+config NEED_KVM_DIRTY_RING_WITH_BITMAP + bool + depends on HAVE_KVM_DIRTY_RING config KVM_MMIO bool @@ -30,13 +52,13 @@ config KVM_ASYNC_PF_SYNC config HAVE_KVM_MSI bool -config HAVE_KVM_CPU_RELAX_INTERCEPT +config HAVE_KVM_READONLY_MEM bool -config KVM_VFIO +config HAVE_KVM_CPU_RELAX_INTERCEPT bool -config HAVE_KVM_ARCH_TLB_FLUSH_ALL +config KVM_VFIO bool config HAVE_KVM_INVALID_WAKEUPS @@ -45,15 +67,61 @@ config HAVE_KVM_INVALID_WAKEUPS config KVM_GENERIC_DIRTYLOG_READ_PROTECT bool +config KVM_GENERIC_PRE_FAULT_MEMORY + bool + config KVM_COMPAT def_bool y - depends on KVM && COMPAT && !(S390 || ARM64) + depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) config HAVE_KVM_IRQ_BYPASS + tristate + select IRQ_BYPASS_MANAGER + +config HAVE_KVM_VCPU_RUN_PID_CHANGE bool -config HAVE_KVM_VCPU_ASYNC_IOCTL +config HAVE_KVM_NO_POLL bool -config HAVE_KVM_VCPU_RUN_PID_CHANGE +config VIRT_XFER_TO_GUEST_WORK + bool + +config HAVE_KVM_PM_NOTIFIER + bool + +config KVM_GENERIC_HARDWARE_ENABLING + bool + +config KVM_GENERIC_MMU_NOTIFIER + select MMU_NOTIFIER + bool + +config KVM_ELIDE_TLB_FLUSH_IF_YOUNG + depends on KVM_GENERIC_MMU_NOTIFIER + bool + +config KVM_MMU_LOCKLESS_AGING + depends on KVM_GENERIC_MMU_NOTIFIER + bool + +config KVM_GENERIC_MEMORY_ATTRIBUTES + depends on KVM_GENERIC_MMU_NOTIFIER + bool + +config KVM_GUEST_MEMFD + depends on KVM_GENERIC_MMU_NOTIFIER + select XARRAY_MULTI + bool + +config HAVE_KVM_ARCH_GMEM_PREPARE + bool + depends on KVM_GUEST_MEMFD + +config HAVE_KVM_ARCH_GMEM_INVALIDATE + bool + depends on KVM_GUEST_MEMFD + +config HAVE_KVM_ARCH_GMEM_POPULATE bool + depends on KVM_GUEST_MEMFD diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm new file mode 100644 index 000000000000..d047d4cf58c9 --- /dev/null +++ b/virt/kvm/Makefile.kvm @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Kernel-based Virtual Machine module +# + +KVM ?= ../../../virt/kvm + +kvm-y := $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o 
+kvm-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o +kvm-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o +kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o +kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o +kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o +kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o +kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c deleted file mode 100644 index 5abbe9b3c652..000000000000 --- a/virt/kvm/arm/aarch32.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * (not much of an) Emulation layer for 32bit guests. - * - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * based on arch/arm/kvm/emulate.c - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/kvm_host.h> -#include <asm/kvm_emulate.h> -#include <asm/kvm_hyp.h> - -/* - * stolen from arch/arm/kernel/opcodes.c - * - * condition code lookup table - * index into the table is test code: EQ, NE, ... 
LT, GT, AL, NV - * - * bit position in short is condition code: NZCV - */ -static const unsigned short cc_map[16] = { - 0xF0F0, /* EQ == Z set */ - 0x0F0F, /* NE */ - 0xCCCC, /* CS == C set */ - 0x3333, /* CC */ - 0xFF00, /* MI == N set */ - 0x00FF, /* PL */ - 0xAAAA, /* VS == V set */ - 0x5555, /* VC */ - 0x0C0C, /* HI == C set && Z clear */ - 0xF3F3, /* LS == C clear || Z set */ - 0xAA55, /* GE == (N==V) */ - 0x55AA, /* LT == (N!=V) */ - 0x0A05, /* GT == (!Z && (N==V)) */ - 0xF5FA, /* LE == (Z || (N!=V)) */ - 0xFFFF, /* AL always */ - 0 /* NV */ -}; - -/* - * Check if a trapped instruction should have been executed or not. - */ -bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu) -{ - unsigned long cpsr; - u32 cpsr_cond; - int cond; - - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_hsr(vcpu) >> 30) - return true; - - /* Is condition field valid? */ - cond = kvm_vcpu_get_condition(vcpu); - if (cond == 0xE) - return true; - - cpsr = *vcpu_cpsr(vcpu); - - if (cond < 0) { - /* This can happen in Thumb mode: examine IT state. */ - unsigned long it; - - it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); - - /* it == 0 => unconditional. */ - if (it == 0) - return true; - - /* The cond for this insn works out as the top 4 bits. */ - cond = (it >> 4); - } - - cpsr_cond = cpsr >> 28; - - if (!((cc_map[cond] >> cpsr_cond) & 1)) - return false; - - return true; -} - -/** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block - * @vcpu: The VCPU pointer - * - * When exceptions occur while instructions are executed in Thumb IF-THEN - * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have - * to do this little bit of work manually. 
The fields map like this: - * - * IT[7:0] -> CPSR[26:25],CPSR[15:10] - */ -static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu) -{ - unsigned long itbits, cond; - unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_arm = !(cpsr & PSR_AA32_T_BIT); - - if (is_arm || !(cpsr & PSR_AA32_IT_MASK)) - return; - - cond = (cpsr & 0xe000) >> 13; - itbits = (cpsr & 0x1c00) >> (10 - 2); - itbits |= (cpsr & (0x3 << 25)) >> 25; - - /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ - if ((itbits & 0x7) == 0) - itbits = cond = 0; - else - itbits = (itbits << 1) & 0x1f; - - cpsr &= ~PSR_AA32_IT_MASK; - cpsr |= cond << 13; - cpsr |= (itbits & 0x1c) << (10 - 2); - cpsr |= (itbits & 0x3) << 25; - *vcpu_cpsr(vcpu) = cpsr; -} - -/** - * kvm_skip_instr - skip a trapped instruction and proceed to the next - * @vcpu: The vcpu pointer - */ -void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - bool is_thumb; - - is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT); - if (is_thumb && !is_wide_instr) - *vcpu_pc(vcpu) += 2; - else - *vcpu_pc(vcpu) += 4; - kvm_adjust_itstate(vcpu); -} - -/* - * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. 
- */ -static const u8 return_offsets[8][2] = { - [0] = { 0, 0 }, /* Reset, unused */ - [1] = { 4, 2 }, /* Undefined */ - [2] = { 0, 0 }, /* SVC, unused */ - [3] = { 4, 4 }, /* Prefetch abort */ - [4] = { 8, 8 }, /* Data abort */ - [5] = { 0, 0 }, /* HVC, unused */ - [6] = { 4, 4 }, /* IRQ, unused */ - [7] = { 4, 4 }, /* FIQ, unused */ -}; - -static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) -{ - unsigned long cpsr; - unsigned long new_spsr_value = *vcpu_cpsr(vcpu); - bool is_thumb = (new_spsr_value & PSR_AA32_T_BIT); - u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - - cpsr = mode | PSR_AA32_I_BIT; - - if (sctlr & (1 << 30)) - cpsr |= PSR_AA32_T_BIT; - if (sctlr & (1 << 25)) - cpsr |= PSR_AA32_E_BIT; - - *vcpu_cpsr(vcpu) = cpsr; - - /* Note: These now point to the banked copies */ - vcpu_write_spsr(vcpu, new_spsr_value); - *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; - - /* Branch to exception vector */ - if (sctlr & (1 << 13)) - vect_offset += 0xffff0000; - else /* always have security exceptions */ - vect_offset += vcpu_cp15(vcpu, c12_VBAR); - - *vcpu_pc(vcpu) = vect_offset; -} - -void kvm_inject_undef32(struct kvm_vcpu *vcpu) -{ - prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4); -} - -/* - * Modelled after TakeDataAbortException() and TakePrefetchAbortException - * pseudocode. 
- */ -static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, - unsigned long addr) -{ - u32 vect_offset; - u32 *far, *fsr; - bool is_lpae; - - if (is_pabt) { - vect_offset = 12; - far = &vcpu_cp15(vcpu, c6_IFAR); - fsr = &vcpu_cp15(vcpu, c5_IFSR); - } else { /* !iabt */ - vect_offset = 16; - far = &vcpu_cp15(vcpu, c6_DFAR); - fsr = &vcpu_cp15(vcpu, c5_DFSR); - } - - prepare_fault32(vcpu, PSR_AA32_MODE_ABT | PSR_AA32_A_BIT, vect_offset); - - *far = addr; - - /* Give the guest an IMPLEMENTATION DEFINED exception */ - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); - if (is_lpae) - *fsr = 1 << 9 | 0x34; - else - *fsr = 0x14; -} - -void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr) -{ - inject_abt32(vcpu, false, addr); -} - -void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr) -{ - inject_abt32(vcpu, true, addr); -} diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c deleted file mode 100644 index b07ac4614e1c..000000000000 --- a/virt/kvm/arm/arch_timer.c +++ /dev/null @@ -1,943 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <linux/cpu.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/uaccess.h> - -#include <clocksource/arm_arch_timer.h> -#include <asm/arch_timer.h> -#include <asm/kvm_hyp.h> - -#include <kvm/arm_vgic.h> -#include <kvm/arm_arch_timer.h> - -#include "trace.h" - -static struct timecounter *timecounter; -static unsigned int host_vtimer_irq; -static u32 host_vtimer_irq_flags; - -static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); - -static const struct kvm_irq_level default_ptimer_irq = { - .irq = 30, - .level = 1, -}; - -static const struct kvm_irq_level default_vtimer_irq = { - .irq = 27, - .level = 1, -}; - -static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); -static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, - struct arch_timer_context *timer_ctx); -static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); - -u64 kvm_phys_timer_read(void) -{ - return timecounter->cc->read(timecounter->cc); -} - -static inline bool userspace_irqchip(struct kvm *kvm) -{ - return static_branch_unlikely(&userspace_irqchip_in_use) && - unlikely(!irqchip_in_kernel(kvm)); -} - -static void soft_timer_start(struct hrtimer *hrt, u64 ns) -{ - hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), - HRTIMER_MODE_ABS); -} - -static void soft_timer_cancel(struct hrtimer *hrt) -{ - hrtimer_cancel(hrt); -} - -static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) -{ - struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; - struct arch_timer_context *vtimer; - - /* - * We may see a timer interrupt after vcpu_put() has been called which - * sets the CPU's vcpu pointer to NULL, because even though the timer - * has been disabled in 
vtimer_save_state(), the hardware interrupt - * signal may not have been retired from the interrupt controller yet. - */ - if (!vcpu) - return IRQ_HANDLED; - - vtimer = vcpu_vtimer(vcpu); - if (kvm_timer_should_fire(vtimer)) - kvm_timer_update_irq(vcpu, true, vtimer); - - if (userspace_irqchip(vcpu->kvm) && - !static_branch_unlikely(&has_gic_active_state)) - disable_percpu_irq(host_vtimer_irq); - - return IRQ_HANDLED; -} - -static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) -{ - u64 cval, now; - - cval = timer_ctx->cnt_cval; - now = kvm_phys_timer_read() - timer_ctx->cntvoff; - - if (now < cval) { - u64 ns; - - ns = cyclecounter_cyc2ns(timecounter->cc, - cval - now, - timecounter->mask, - &timecounter->frac); - return ns; - } - - return 0; -} - -static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) -{ - return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && - (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); -} - -/* - * Returns the earliest expiration time in ns among guest timers. - * Note that it will return 0 if none of timers can fire. 
- */ -static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) -{ - u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - if (kvm_timer_irq_can_fire(vtimer)) - min_virt = kvm_timer_compute_delta(vtimer); - - if (kvm_timer_irq_can_fire(ptimer)) - min_phys = kvm_timer_compute_delta(ptimer); - - /* If none of timers can fire, then return 0 */ - if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX)) - return 0; - - return min(min_virt, min_phys); -} - -static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) -{ - struct arch_timer_cpu *timer; - struct kvm_vcpu *vcpu; - u64 ns; - - timer = container_of(hrt, struct arch_timer_cpu, bg_timer); - vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); - - /* - * Check that the timer has really expired from the guest's - * PoV (NTP on the host may have forced it to expire - * early). If we should have slept longer, restart it. - */ - ns = kvm_timer_earliest_exp(vcpu); - if (unlikely(ns)) { - hrtimer_forward_now(hrt, ns_to_ktime(ns)); - return HRTIMER_RESTART; - } - - kvm_vcpu_wake_up(vcpu); - return HRTIMER_NORESTART; -} - -static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) -{ - struct arch_timer_context *ptimer; - struct arch_timer_cpu *timer; - struct kvm_vcpu *vcpu; - u64 ns; - - timer = container_of(hrt, struct arch_timer_cpu, phys_timer); - vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); - ptimer = vcpu_ptimer(vcpu); - - /* - * Check that the timer has really expired from the guest's - * PoV (NTP on the host may have forced it to expire - * early). If not ready, schedule for a later time. 
- */ - ns = kvm_timer_compute_delta(ptimer); - if (unlikely(ns)) { - hrtimer_forward_now(hrt, ns_to_ktime(ns)); - return HRTIMER_RESTART; - } - - kvm_timer_update_irq(vcpu, true, ptimer); - return HRTIMER_NORESTART; -} - -static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) -{ - u64 cval, now; - - if (timer_ctx->loaded) { - u32 cnt_ctl; - - /* Only the virtual timer can be loaded so far */ - cnt_ctl = read_sysreg_el0(cntv_ctl); - return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && - (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && - !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); - } - - if (!kvm_timer_irq_can_fire(timer_ctx)) - return false; - - cval = timer_ctx->cnt_cval; - now = kvm_phys_timer_read() - timer_ctx->cntvoff; - - return cval <= now; -} - -bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - if (kvm_timer_should_fire(vtimer)) - return true; - - return kvm_timer_should_fire(ptimer); -} - -/* - * Reflect the timer output level into the kvm_run structure - */ -void kvm_timer_update_run(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct kvm_sync_regs *regs = &vcpu->run->s.regs; - - /* Populate the device bitmap with the timer states */ - regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | - KVM_ARM_DEV_EL1_PTIMER); - if (kvm_timer_should_fire(vtimer)) - regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; - if (kvm_timer_should_fire(ptimer)) - regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; -} - -static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, - struct arch_timer_context *timer_ctx) -{ - int ret; - - timer_ctx->irq.level = new_level; - trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, - timer_ctx->irq.level); - - if (!userspace_irqchip(vcpu->kvm)) { - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - 
timer_ctx->irq.irq, - timer_ctx->irq.level, - timer_ctx); - WARN_ON(ret); - } -} - -/* Schedule the background timer for the emulated timer. */ -static void phys_timer_emulate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - /* - * If the timer can fire now, we don't need to have a soft timer - * scheduled for the future. If the timer cannot fire at all, - * then we also don't need a soft timer. - */ - if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { - soft_timer_cancel(&timer->phys_timer); - return; - } - - soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer)); -} - -/* - * Check if there was a change in the timer state, so that we should either - * raise or lower the line level to the GIC or schedule a background timer to - * emulate the physical timer. - */ -static void kvm_timer_update_state(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - bool level; - - if (unlikely(!timer->enabled)) - return; - - /* - * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part - * of its lifecycle is offloaded to the hardware, and we therefore may - * not have lowered the irq.level value before having to signal a new - * interrupt, but have to signal an interrupt every time the level is - * asserted. 
- */ - level = kvm_timer_should_fire(vtimer); - kvm_timer_update_irq(vcpu, level, vtimer); - - phys_timer_emulate(vcpu); - - if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) - kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); -} - -static void vtimer_save_state(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - unsigned long flags; - - local_irq_save(flags); - - if (!vtimer->loaded) - goto out; - - if (timer->enabled) { - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - vtimer->cnt_cval = read_sysreg_el0(cntv_cval); - } - - /* Disable the virtual timer */ - write_sysreg_el0(0, cntv_ctl); - isb(); - - vtimer->loaded = false; -out: - local_irq_restore(flags); -} - -/* - * Schedule the background timer before calling kvm_vcpu_block, so that this - * thread is removed from its waitqueue and made runnable when there's a timer - * interrupt to handle. - */ -void kvm_timer_schedule(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - vtimer_save_state(vcpu); - - /* - * No need to schedule a background timer if any guest timer has - * already expired, because kvm_vcpu_block will return before putting - * the thread to sleep. - */ - if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer)) - return; - - /* - * If both timers are not capable of raising interrupts (disabled or - * masked), then there's no more work for us to do. - */ - if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer)) - return; - - /* - * The guest timers have not yet expired, schedule a background timer. - * Set the earliest expiration time among the guest timers. 
- */ - soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); -} - -static void vtimer_restore_state(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - unsigned long flags; - - local_irq_save(flags); - - if (vtimer->loaded) - goto out; - - if (timer->enabled) { - write_sysreg_el0(vtimer->cnt_cval, cntv_cval); - isb(); - write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); - } - - vtimer->loaded = true; -out: - local_irq_restore(flags); -} - -void kvm_timer_unschedule(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - vtimer_restore_state(vcpu); - - soft_timer_cancel(&timer->bg_timer); -} - -static void set_cntvoff(u64 cntvoff) -{ - u32 low = lower_32_bits(cntvoff); - u32 high = upper_32_bits(cntvoff); - - /* - * Since kvm_call_hyp doesn't fully support the ARM PCS especially on - * 32-bit systems, but rather passes register by register shifted one - * place (we put the function address in r0/x0), we cannot simply pass - * a 64-bit value as an argument, but have to split the value in two - * 32-bit halves. 
- */ - kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); -} - -static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active) -{ - int r; - r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active); - WARN_ON(r); -} - -static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - bool phys_active; - - if (irqchip_in_kernel(vcpu->kvm)) - phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); - else - phys_active = vtimer->irq.level; - set_vtimer_irq_phys_active(vcpu, phys_active); -} - -static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - - /* - * When using a userspace irqchip with the architected timers and a - * host interrupt controller that doesn't support an active state, we - * must still prevent continuously exiting from the guest, and - * therefore mask the physical interrupt by disabling it on the host - * interrupt controller when the virtual level is high, such that the - * guest can make forward progress. Once we detect the output level - * being de-asserted, we unmask the interrupt again so that we exit - * from the guest when the timer fires. - */ - if (vtimer->irq.level) - disable_percpu_irq(host_vtimer_irq); - else - enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); -} - -void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - if (unlikely(!timer->enabled)) - return; - - if (static_branch_likely(&has_gic_active_state)) - kvm_timer_vcpu_load_gic(vcpu); - else - kvm_timer_vcpu_load_nogic(vcpu); - - set_cntvoff(vtimer->cntvoff); - - vtimer_restore_state(vcpu); - - /* Set the background timer for the physical timer emulation. 
*/ - phys_timer_emulate(vcpu); - - /* If the timer fired while we weren't running, inject it now */ - if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) - kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); -} - -bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct kvm_sync_regs *sregs = &vcpu->run->s.regs; - bool vlevel, plevel; - - if (likely(irqchip_in_kernel(vcpu->kvm))) - return false; - - vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; - plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; - - return kvm_timer_should_fire(vtimer) != vlevel || - kvm_timer_should_fire(ptimer) != plevel; -} - -void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - if (unlikely(!timer->enabled)) - return; - - vtimer_save_state(vcpu); - - /* - * Cancel the physical timer emulation, because the only case where we - * need it after a vcpu_put is in the context of a sleeping VCPU, and - * in that case we already factor in the deadline for the physical - * timer when scheduling the bg_timer. - * - * In any case, we re-schedule the hrtimer for the physical timer when - * coming back to the VCPU thread in kvm_timer_vcpu_load(). - */ - soft_timer_cancel(&timer->phys_timer); - - /* - * The kernel may decide to run userspace after calling vcpu_put, so - * we reset cntvoff to 0 to ensure a consistent read between user - * accesses to the virtual counter and kernel access to the physical - * counter of non-VHE case. For VHE, the virtual counter uses a fixed - * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. - */ - if (!has_vhe()) - set_cntvoff(0); -} - -/* - * With a userspace irqchip we have to check if the guest de-asserted the - * timer and if so, unmask the timer irq signal on the host interrupt - * controller to ensure that we see future timer signals. 
- */ -static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - - if (!kvm_timer_should_fire(vtimer)) { - kvm_timer_update_irq(vcpu, false, vtimer); - if (static_branch_likely(&has_gic_active_state)) - set_vtimer_irq_phys_active(vcpu, false); - else - enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); - } -} - -void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - if (unlikely(!timer->enabled)) - return; - - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - unmask_vtimer_irq_user(vcpu); -} - -int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - /* - * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 - * and to 0 for ARMv7. We provide an implementation that always - * resets the timer to be disabled and unmasked and is compliant with - * the ARMv7 architecture. - */ - vtimer->cnt_ctl = 0; - ptimer->cnt_ctl = 0; - kvm_timer_update_state(vcpu); - - if (timer->enabled && irqchip_in_kernel(vcpu->kvm)) - kvm_vgic_reset_mapped_irq(vcpu, vtimer->irq.irq); - - return 0; -} - -/* Make the updates of cntvoff for all vtimer contexts atomic */ -static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) -{ - int i; - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *tmp; - - mutex_lock(&kvm->lock); - kvm_for_each_vcpu(i, tmp, kvm) - vcpu_vtimer(tmp)->cntvoff = cntvoff; - - /* - * When called from the vcpu create path, the CPU being created is not - * included in the loop above, so we just set it here as well. 
- */ - vcpu_vtimer(vcpu)->cntvoff = cntvoff; - mutex_unlock(&kvm->lock); -} - -void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - /* Synchronize cntvoff across all vtimers of a VM. */ - update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); - vcpu_ptimer(vcpu)->cntvoff = 0; - - hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - timer->bg_timer.function = kvm_bg_timer_expire; - - hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - timer->phys_timer.function = kvm_phys_timer_expire; - - vtimer->irq.irq = default_vtimer_irq.irq; - ptimer->irq.irq = default_ptimer_irq.irq; -} - -static void kvm_timer_init_interrupt(void *info) -{ - enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); -} - -int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; - break; - case KVM_REG_ARM_TIMER_CNT: - update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); - break; - case KVM_REG_ARM_TIMER_CVAL: - vtimer->cnt_cval = value; - break; - case KVM_REG_ARM_PTIMER_CTL: - ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; - break; - case KVM_REG_ARM_PTIMER_CVAL: - ptimer->cnt_cval = value; - break; - - default: - return -1; - } - - kvm_timer_update_state(vcpu); - return 0; -} - -static u64 read_timer_ctl(struct arch_timer_context *timer) -{ - /* - * Set ISTATUS bit if it's expired. - * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is - * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit - * regardless of ENABLE bit for our implementation convenience. 
- */ - if (!kvm_timer_compute_delta(timer)) - return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT; - else - return timer->cnt_ctl; -} - -u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) -{ - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - return read_timer_ctl(vtimer); - case KVM_REG_ARM_TIMER_CNT: - return kvm_phys_timer_read() - vtimer->cntvoff; - case KVM_REG_ARM_TIMER_CVAL: - return vtimer->cnt_cval; - case KVM_REG_ARM_PTIMER_CTL: - return read_timer_ctl(ptimer); - case KVM_REG_ARM_PTIMER_CVAL: - return ptimer->cnt_cval; - case KVM_REG_ARM_PTIMER_CNT: - return kvm_phys_timer_read(); - } - return (u64)-1; -} - -static int kvm_timer_starting_cpu(unsigned int cpu) -{ - kvm_timer_init_interrupt(NULL); - return 0; -} - -static int kvm_timer_dying_cpu(unsigned int cpu) -{ - disable_percpu_irq(host_vtimer_irq); - return 0; -} - -int kvm_timer_hyp_init(bool has_gic) -{ - struct arch_timer_kvm_info *info; - int err; - - info = arch_timer_get_kvm_info(); - timecounter = &info->timecounter; - - if (!timecounter->cc) { - kvm_err("kvm_arch_timer: uninitialized timecounter\n"); - return -ENODEV; - } - - if (info->virtual_irq <= 0) { - kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", - info->virtual_irq); - return -ENODEV; - } - host_vtimer_irq = info->virtual_irq; - - host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); - if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && - host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { - kvm_err("Invalid trigger for IRQ%d, assuming level low\n", - host_vtimer_irq); - host_vtimer_irq_flags = IRQF_TRIGGER_LOW; - } - - err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, - "kvm guest timer", kvm_get_running_vcpus()); - if (err) { - kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", - host_vtimer_irq, err); - return err; - } - - if (has_gic) { - err = 
irq_set_vcpu_affinity(host_vtimer_irq, - kvm_get_running_vcpus()); - if (err) { - kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); - goto out_free_irq; - } - - static_branch_enable(&has_gic_active_state); - } - - kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); - - cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, - "kvm/arm/timer:starting", kvm_timer_starting_cpu, - kvm_timer_dying_cpu); - return 0; -out_free_irq: - free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); - return err; -} - -void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - soft_timer_cancel(&timer->bg_timer); -} - -static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) -{ - int vtimer_irq, ptimer_irq; - int i, ret; - - vtimer_irq = vcpu_vtimer(vcpu)->irq.irq; - ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu)); - if (ret) - return false; - - ptimer_irq = vcpu_ptimer(vcpu)->irq.irq; - ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu)); - if (ret) - return false; - - kvm_for_each_vcpu(i, vcpu, vcpu->kvm) { - if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq || - vcpu_ptimer(vcpu)->irq.irq != ptimer_irq) - return false; - } - - return true; -} - -bool kvm_arch_timer_get_input_level(int vintid) -{ - struct kvm_vcpu *vcpu = kvm_arm_get_running_vcpu(); - struct arch_timer_context *timer; - - if (vintid == vcpu_vtimer(vcpu)->irq.irq) - timer = vcpu_vtimer(vcpu); - else - BUG(); /* We only map the vtimer so far */ - - return kvm_timer_should_fire(timer); -} - -int kvm_timer_enable(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - int ret; - - if (timer->enabled) - return 0; - - /* Without a VGIC we do not map virtual IRQs to physical IRQs */ - if (!irqchip_in_kernel(vcpu->kvm)) - goto no_vgic; - - if (!vgic_initialized(vcpu->kvm)) - return -ENODEV; - - if (!timer_irqs_are_valid(vcpu)) { - kvm_debug("incorrectly 
configured timer irqs\n"); - return -EINVAL; - } - - ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq, - kvm_arch_timer_get_input_level); - if (ret) - return ret; - -no_vgic: - timer->enabled = 1; - return 0; -} - -/* - * On VHE system, we only need to configure trap on physical timer and counter - * accesses in EL0 and EL1 once, not for every world switch. - * The host kernel runs at EL2 with HCR_EL2.TGE == 1, - * and this makes those bits have no effect for the host kernel execution. - */ -void kvm_timer_init_vhe(void) -{ - /* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */ - u32 cnthctl_shift = 10; - u64 val; - - /* - * Disallow physical timer access for the guest. - * Physical counter access is allowed. - */ - val = read_sysreg(cnthctl_el2); - val &= ~(CNTHCTL_EL1PCEN << cnthctl_shift); - val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); - write_sysreg(val, cnthctl_el2); -} - -static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq) -{ - struct kvm_vcpu *vcpu; - int i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu_vtimer(vcpu)->irq.irq = vtimer_irq; - vcpu_ptimer(vcpu)->irq.irq = ptimer_irq; - } -} - -int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - int __user *uaddr = (int __user *)(long)attr->addr; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - int irq; - - if (!irqchip_in_kernel(vcpu->kvm)) - return -EINVAL; - - if (get_user(irq, uaddr)) - return -EFAULT; - - if (!(irq_is_ppi(irq))) - return -EINVAL; - - if (vcpu->arch.timer_cpu.enabled) - return -EBUSY; - - switch (attr->attr) { - case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: - set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq); - break; - case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: - set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq); - break; - default: - return -ENXIO; - } - - return 0; -} - -int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - int 
__user *uaddr = (int __user *)(long)attr->addr; - struct arch_timer_context *timer; - int irq; - - switch (attr->attr) { - case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: - timer = vcpu_vtimer(vcpu); - break; - case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: - timer = vcpu_ptimer(vcpu); - break; - default: - return -ENXIO; - } - - irq = timer->irq.irq; - return put_user(irq, uaddr); -} - -int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: - case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: - return 0; - } - - return -ENXIO; -} diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c deleted file mode 100644 index 9e350fd34504..000000000000 --- a/virt/kvm/arm/arm.c +++ /dev/null @@ -1,1711 +0,0 @@ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- */ - -#include <linux/bug.h> -#include <linux/cpu_pm.h> -#include <linux/errno.h> -#include <linux/err.h> -#include <linux/kvm_host.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/vmalloc.h> -#include <linux/fs.h> -#include <linux/mman.h> -#include <linux/sched.h> -#include <linux/kvm.h> -#include <linux/kvm_irqfd.h> -#include <linux/irqbypass.h> -#include <linux/sched/stat.h> -#include <trace/events/kvm.h> -#include <kvm/arm_pmu.h> -#include <kvm/arm_psci.h> - -#define CREATE_TRACE_POINTS -#include "trace.h" - -#include <linux/uaccess.h> -#include <asm/ptrace.h> -#include <asm/mman.h> -#include <asm/tlbflush.h> -#include <asm/cacheflush.h> -#include <asm/cpufeature.h> -#include <asm/virt.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_mmu.h> -#include <asm/kvm_emulate.h> -#include <asm/kvm_coproc.h> -#include <asm/sections.h> - -#ifdef REQUIRES_VIRT -__asm__(".arch_extension virt"); -#endif - -DEFINE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); -static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); - -/* Per-CPU variable containing the currently running vcpu. */ -static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); - -/* The VMID used in the VTTBR */ -static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); -static u32 kvm_next_vmid; -static unsigned int kvm_vmid_bits __read_mostly; -static DEFINE_SPINLOCK(kvm_vmid_lock); - -static bool vgic_present; - -static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); - -static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) -{ - __this_cpu_write(kvm_arm_running_vcpu, vcpu); -} - -DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); - -/** - * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU. 
- * Must be called from non-preemptible context - */ -struct kvm_vcpu *kvm_arm_get_running_vcpu(void) -{ - return __this_cpu_read(kvm_arm_running_vcpu); -} - -/** - * kvm_arm_get_running_vcpus - get the per-CPU array of currently running vcpus. - */ -struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) -{ - return &kvm_arm_running_vcpu; -} - -int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; -} - -int kvm_arch_hardware_setup(void) -{ - return 0; -} - -void kvm_arch_check_processor_compat(void *rtn) -{ - *(int *)rtn = 0; -} - - -/** - * kvm_arch_init_vm - initializes a VM data structure - * @kvm: pointer to the KVM struct - */ -int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) -{ - int ret, cpu; - - ret = kvm_arm_setup_stage2(kvm, type); - if (ret) - return ret; - - kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran)); - if (!kvm->arch.last_vcpu_ran) - return -ENOMEM; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1; - - ret = kvm_alloc_stage2_pgd(kvm); - if (ret) - goto out_fail_alloc; - - ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP); - if (ret) - goto out_free_stage2_pgd; - - kvm_vgic_early_init(kvm); - - /* Mark the initial VMID generation invalid */ - kvm->arch.vmid_gen = 0; - - /* The maximum number of VCPUs is limited by the host's GIC model */ - kvm->arch.max_vcpus = vgic_present ? 
- kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; - - return ret; -out_free_stage2_pgd: - kvm_free_stage2_pgd(kvm); -out_fail_alloc: - free_percpu(kvm->arch.last_vcpu_ran); - kvm->arch.last_vcpu_ran = NULL; - return ret; -} - -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - -vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) -{ - return VM_FAULT_SIGBUS; -} - - -/** - * kvm_arch_destroy_vm - destroy the VM data structure - * @kvm: pointer to the KVM struct - */ -void kvm_arch_destroy_vm(struct kvm *kvm) -{ - int i; - - kvm_vgic_destroy(kvm); - - free_percpu(kvm->arch.last_vcpu_ran); - kvm->arch.last_vcpu_ran = NULL; - - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } - atomic_set(&kvm->online_vcpus, 0); -} - -int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) -{ - int r; - switch (ext) { - case KVM_CAP_IRQCHIP: - r = vgic_present; - break; - case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: - case KVM_CAP_USER_MEMORY: - case KVM_CAP_SYNC_MMU: - case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: - case KVM_CAP_ONE_REG: - case KVM_CAP_ARM_PSCI: - case KVM_CAP_ARM_PSCI_0_2: - case KVM_CAP_READONLY_MEM: - case KVM_CAP_MP_STATE: - case KVM_CAP_IMMEDIATE_EXIT: - case KVM_CAP_VCPU_EVENTS: - r = 1; - break; - case KVM_CAP_ARM_SET_DEVICE_ADDR: - r = 1; - break; - case KVM_CAP_NR_VCPUS: - r = num_online_cpus(); - break; - case KVM_CAP_MAX_VCPUS: - r = KVM_MAX_VCPUS; - break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; - case KVM_CAP_MSI_DEVID: - if (!kvm) - r = -EINVAL; - else - r = kvm->arch.vgic.msis_require_devid; - break; - case KVM_CAP_ARM_USER_IRQ: - /* - * 1: EL1_VTIMER, EL1_PTIMER, and PMU. 
- * (bump this number if adding more devices) - */ - r = 1; - break; - default: - r = kvm_arch_vm_ioctl_check_extension(kvm, ext); - break; - } - return r; -} - -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - return -EINVAL; -} - -struct kvm *kvm_arch_alloc_vm(void) -{ - if (!has_vhe()) - return kzalloc(sizeof(struct kvm), GFP_KERNEL); - - return vzalloc(sizeof(struct kvm)); -} - -void kvm_arch_free_vm(struct kvm *kvm) -{ - if (!has_vhe()) - kfree(kvm); - else - vfree(kvm); -} - -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) -{ - int err; - struct kvm_vcpu *vcpu; - - if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) { - err = -EBUSY; - goto out; - } - - if (id >= kvm->arch.max_vcpus) { - err = -EINVAL; - goto out; - } - - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) { - err = -ENOMEM; - goto out; - } - - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_vcpu; - - err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); - if (err) - goto vcpu_uninit; - - return vcpu; -vcpu_uninit: - kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(err); -} - -void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) -{ -} - -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) - static_branch_dec(&userspace_irqchip_in_use); - - kvm_mmu_free_memory_caches(vcpu); - kvm_timer_vcpu_terminate(vcpu); - kvm_pmu_vcpu_destroy(vcpu); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); -} - -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_free(vcpu); -} - -int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) -{ - return kvm_timer_is_pending(vcpu); -} - -void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) -{ - kvm_timer_schedule(vcpu); - kvm_vgic_v4_enable_doorbell(vcpu); -} - -void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) -{ - 
kvm_timer_unschedule(vcpu); - kvm_vgic_v4_disable_doorbell(vcpu); -} - -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - /* Force users to call KVM_ARM_VCPU_INIT */ - vcpu->arch.target = -1; - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); - - /* Set up the timer */ - kvm_timer_vcpu_init(vcpu); - - kvm_arm_reset_debug_ptr(vcpu); - - return kvm_vgic_vcpu_init(vcpu); -} - -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - int *last_ran; - - last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran); - - /* - * We might get preempted before the vCPU actually runs, but - * over-invalidation doesn't affect correctness. - */ - if (*last_ran != vcpu->vcpu_id) { - kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu); - *last_ran = vcpu->vcpu_id; - } - - vcpu->cpu = cpu; - vcpu->arch.host_cpu_context = this_cpu_ptr(&kvm_host_cpu_state); - - kvm_arm_set_running_vcpu(vcpu); - kvm_vgic_load(vcpu); - kvm_timer_vcpu_load(vcpu); - kvm_vcpu_load_sysregs(vcpu); - kvm_arch_vcpu_load_fp(vcpu); - - if (single_task_running()) - vcpu_clear_wfe_traps(vcpu); - else - vcpu_set_wfe_traps(vcpu); -} - -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_put_fp(vcpu); - kvm_vcpu_put_sysregs(vcpu); - kvm_timer_vcpu_put(vcpu); - kvm_vgic_put(vcpu); - - vcpu->cpu = -1; - - kvm_arm_set_running_vcpu(NULL); -} - -static void vcpu_power_off(struct kvm_vcpu *vcpu) -{ - vcpu->arch.power_off = true; - kvm_make_request(KVM_REQ_SLEEP, vcpu); - kvm_vcpu_kick(vcpu); -} - -int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - if (vcpu->arch.power_off) - mp_state->mp_state = KVM_MP_STATE_STOPPED; - else - mp_state->mp_state = KVM_MP_STATE_RUNNABLE; - - return 0; -} - -int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - int ret = 0; - - switch (mp_state->mp_state) { - case KVM_MP_STATE_RUNNABLE: - vcpu->arch.power_off = false; - break; - case KVM_MP_STATE_STOPPED: - vcpu_power_off(vcpu); - break; - 
default: - ret = -EINVAL; - } - - return ret; -} - -/** - * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled - * @v: The VCPU pointer - * - * If the guest CPU is not waiting for interrupts or an interrupt line is - * asserted, the CPU is by definition runnable. - */ -int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) -{ - bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF); - return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) - && !v->arch.power_off && !v->arch.pause); -} - -bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) -{ - return vcpu_mode_priv(vcpu); -} - -/* Just ensure a guest exit from a particular CPU */ -static void exit_vm_noop(void *info) -{ -} - -void force_vm_exit(const cpumask_t *mask) -{ - preempt_disable(); - smp_call_function_many(mask, exit_vm_noop, NULL, true); - preempt_enable(); -} - -/** - * need_new_vmid_gen - check that the VMID is still valid - * @kvm: The VM's VMID to check - * - * return true if there is a new generation of VMIDs being used - * - * The hardware supports only 256 values with the value zero reserved for the - * host, so we check if an assigned value belongs to a previous generation, - * which which requires us to assign a new value. If we're the first to use a - * VMID for the new generation, we must flush necessary caches and TLBs on all - * CPUs. - */ -static bool need_new_vmid_gen(struct kvm *kvm) -{ - u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); - smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ - return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen); -} - -/** - * update_vttbr - Update the VTTBR with a valid VMID before the guest runs - * @kvm The guest that we are about to run - * - * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the - * VM has a valid VMID, otherwise assigns a new one and flushes corresponding - * caches and TLBs. - */ -static void update_vttbr(struct kvm *kvm) -{ - phys_addr_t pgd_phys; - u64 vmid, cnp = kvm_cpu_has_cnp() ? 
VTTBR_CNP_BIT : 0; - - if (!need_new_vmid_gen(kvm)) - return; - - spin_lock(&kvm_vmid_lock); - - /* - * We need to re-check the vmid_gen here to ensure that if another vcpu - * already allocated a valid vmid for this vm, then this vcpu should - * use the same vmid. - */ - if (!need_new_vmid_gen(kvm)) { - spin_unlock(&kvm_vmid_lock); - return; - } - - /* First user of a new VMID generation? */ - if (unlikely(kvm_next_vmid == 0)) { - atomic64_inc(&kvm_vmid_gen); - kvm_next_vmid = 1; - - /* - * On SMP we know no other CPUs can use this CPU's or each - * other's VMID after force_vm_exit returns since the - * kvm_vmid_lock blocks them from reentry to the guest. - */ - force_vm_exit(cpu_all_mask); - /* - * Now broadcast TLB + ICACHE invalidation over the inner - * shareable domain to make sure all data structures are - * clean. - */ - kvm_call_hyp(__kvm_flush_vm_context); - } - - kvm->arch.vmid = kvm_next_vmid; - kvm_next_vmid++; - kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; - - /* update vttbr to be used with the new vmid */ - pgd_phys = virt_to_phys(kvm->arch.pgd); - BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)); - vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); - kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; - - smp_wmb(); - WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen)); - - spin_unlock(&kvm_vmid_lock); -} - -static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - int ret = 0; - - if (likely(vcpu->arch.has_run_once)) - return 0; - - vcpu->arch.has_run_once = true; - - if (likely(irqchip_in_kernel(kvm))) { - /* - * Map the VGIC hardware resources before running a vcpu the - * first time on this VM. - */ - if (unlikely(!vgic_ready(kvm))) { - ret = kvm_vgic_map_resources(kvm); - if (ret) - return ret; - } - } else { - /* - * Tell the rest of the code that there are userspace irqchip - * VMs in the wild. 
- */ - static_branch_inc(&userspace_irqchip_in_use); - } - - ret = kvm_timer_enable(vcpu); - if (ret) - return ret; - - ret = kvm_arm_pmu_v3_enable(vcpu); - - return ret; -} - -bool kvm_arch_intc_initialized(struct kvm *kvm) -{ - return vgic_initialized(kvm); -} - -void kvm_arm_halt_guest(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) - vcpu->arch.pause = true; - kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP); -} - -void kvm_arm_resume_guest(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu->arch.pause = false; - swake_up_one(kvm_arch_vcpu_wq(vcpu)); - } -} - -static void vcpu_req_sleep(struct kvm_vcpu *vcpu) -{ - struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); - - swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) && - (!vcpu->arch.pause))); - - if (vcpu->arch.power_off || vcpu->arch.pause) { - /* Awaken to handle a signal, request we sleep again later. */ - kvm_make_request(KVM_REQ_SLEEP, vcpu); - } -} - -static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.target >= 0; -} - -static void check_vcpu_requests(struct kvm_vcpu *vcpu) -{ - if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) - vcpu_req_sleep(vcpu); - - /* - * Clear IRQ_PENDING requests that were made to guarantee - * that a VCPU sees new virtual interrupts. - */ - kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); - } -} - -/** - * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code - * @vcpu: The VCPU pointer - * @run: The kvm_run structure pointer used for userspace state exchange - * - * This function is called through the VCPU_RUN ioctl called from user space. 
It - * will execute VM code in a loop until the time slice for the process is used - * or some emulation is needed from user space in which case the function will - * return with return value 0 and with the kvm_run structure filled in with the - * required data for the requested emulation. - */ -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - int ret; - - if (unlikely(!kvm_vcpu_initialized(vcpu))) - return -ENOEXEC; - - ret = kvm_vcpu_first_run_init(vcpu); - if (ret) - return ret; - - if (run->exit_reason == KVM_EXIT_MMIO) { - ret = kvm_handle_mmio_return(vcpu, vcpu->run); - if (ret) - return ret; - } - - if (run->immediate_exit) - return -EINTR; - - vcpu_load(vcpu); - - kvm_sigset_activate(vcpu); - - ret = 1; - run->exit_reason = KVM_EXIT_UNKNOWN; - while (ret > 0) { - /* - * Check conditions before entering the guest - */ - cond_resched(); - - update_vttbr(vcpu->kvm); - - check_vcpu_requests(vcpu); - - /* - * Preparing the interrupts to be injected also - * involves poking the GIC, which must be done in a - * non-preemptible context. - */ - preempt_disable(); - - kvm_pmu_flush_hwstate(vcpu); - - local_irq_disable(); - - kvm_vgic_flush_hwstate(vcpu); - - /* - * Exit if we have a signal pending so that we can deliver the - * signal to user space. - */ - if (signal_pending(current)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - - /* - * If we're using a userspace irqchip, then check if we need - * to tell a userspace irqchip about timer or PMU level - * changes and if so, exit to userspace (the actual level - * state gets updated in kvm_timer_update_run and - * kvm_pmu_update_run below). - */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { - if (kvm_timer_should_notify_user(vcpu) || - kvm_pmu_should_notify_user(vcpu)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - } - - /* - * Ensure we set mode to IN_GUEST_MODE after we disable - * interrupts and before the final VCPU requests check. 
- * See the comment in kvm_vcpu_exiting_guest_mode() and - * Documentation/virtual/kvm/vcpu-requests.rst - */ - smp_store_mb(vcpu->mode, IN_GUEST_MODE); - - if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || - kvm_request_pending(vcpu)) { - vcpu->mode = OUTSIDE_GUEST_MODE; - isb(); /* Ensure work in x_flush_hwstate is committed */ - kvm_pmu_sync_hwstate(vcpu); - if (static_branch_unlikely(&userspace_irqchip_in_use)) - kvm_timer_sync_hwstate(vcpu); - kvm_vgic_sync_hwstate(vcpu); - local_irq_enable(); - preempt_enable(); - continue; - } - - kvm_arm_setup_debug(vcpu); - - /************************************************************** - * Enter the guest - */ - trace_kvm_entry(*vcpu_pc(vcpu)); - guest_enter_irqoff(); - - if (has_vhe()) { - kvm_arm_vhe_guest_enter(); - ret = kvm_vcpu_run_vhe(vcpu); - kvm_arm_vhe_guest_exit(); - } else { - ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu); - } - - vcpu->mode = OUTSIDE_GUEST_MODE; - vcpu->stat.exits++; - /* - * Back from guest - *************************************************************/ - - kvm_arm_clear_debug(vcpu); - - /* - * We must sync the PMU state before the vgic state so - * that the vgic can properly sample the updated state of the - * interrupt line. - */ - kvm_pmu_sync_hwstate(vcpu); - - /* - * Sync the vgic state before syncing the timer state because - * the timer code needs to know if the virtual timer - * interrupts are active. - */ - kvm_vgic_sync_hwstate(vcpu); - - /* - * Sync the timer hardware state before enabling interrupts as - * we don't want vtimer interrupts to race with syncing the - * timer virtual interrupt state. - */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) - kvm_timer_sync_hwstate(vcpu); - - kvm_arch_vcpu_ctxsync_fp(vcpu); - - /* - * We may have taken a host interrupt in HYP mode (ie - * while executing the guest). This interrupt is still - * pending, as we haven't serviced it yet! - * - * We're now back in SVC mode, with interrupts - * disabled. 
Enabling the interrupts now will have - * the effect of taking the interrupt again, in SVC - * mode this time. - */ - local_irq_enable(); - - /* - * We do local_irq_enable() before calling guest_exit() so - * that if a timer interrupt hits while running the guest we - * account that tick as being spent in the guest. We enable - * preemption after calling guest_exit() so that if we get - * preempted we make sure ticks after that is not counted as - * guest time. - */ - guest_exit(); - trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); - - /* Exit types that need handling before we can be preempted */ - handle_exit_early(vcpu, run, ret); - - preempt_enable(); - - ret = handle_exit(vcpu, run, ret); - } - - /* Tell userspace about in-kernel device output levels */ - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { - kvm_timer_update_run(vcpu); - kvm_pmu_update_run(vcpu); - } - - kvm_sigset_deactivate(vcpu); - - vcpu_put(vcpu); - return ret; -} - -static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) -{ - int bit_index; - bool set; - unsigned long *hcr; - - if (number == KVM_ARM_IRQ_CPU_IRQ) - bit_index = __ffs(HCR_VI); - else /* KVM_ARM_IRQ_CPU_FIQ */ - bit_index = __ffs(HCR_VF); - - hcr = vcpu_hcr(vcpu); - if (level) - set = test_and_set_bit(bit_index, hcr); - else - set = test_and_clear_bit(bit_index, hcr); - - /* - * If we didn't change anything, no need to wake up or kick other CPUs - */ - if (set == level) - return 0; - - /* - * The vcpu irq_lines field was updated, wake up sleeping VCPUs and - * trigger a world-switch round on the running physical CPU to set the - * virtual IRQ/FIQ fields in the HCR appropriately. 
- */ - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - - return 0; -} - -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, - bool line_status) -{ - u32 irq = irq_level->irq; - unsigned int irq_type, vcpu_idx, irq_num; - int nrcpus = atomic_read(&kvm->online_vcpus); - struct kvm_vcpu *vcpu = NULL; - bool level = irq_level->level; - - irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; - vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; - irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; - - trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level); - - switch (irq_type) { - case KVM_ARM_IRQ_TYPE_CPU: - if (irqchip_in_kernel(kvm)) - return -ENXIO; - - if (vcpu_idx >= nrcpus) - return -EINVAL; - - vcpu = kvm_get_vcpu(kvm, vcpu_idx); - if (!vcpu) - return -EINVAL; - - if (irq_num > KVM_ARM_IRQ_CPU_FIQ) - return -EINVAL; - - return vcpu_interrupt_line(vcpu, irq_num, level); - case KVM_ARM_IRQ_TYPE_PPI: - if (!irqchip_in_kernel(kvm)) - return -ENXIO; - - if (vcpu_idx >= nrcpus) - return -EINVAL; - - vcpu = kvm_get_vcpu(kvm, vcpu_idx); - if (!vcpu) - return -EINVAL; - - if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) - return -EINVAL; - - return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL); - case KVM_ARM_IRQ_TYPE_SPI: - if (!irqchip_in_kernel(kvm)) - return -ENXIO; - - if (irq_num < VGIC_NR_PRIVATE_IRQS) - return -EINVAL; - - return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL); - } - - return -EINVAL; -} - -static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, - const struct kvm_vcpu_init *init) -{ - unsigned int i; - int phys_target = kvm_target_cpu(); - - if (init->target != phys_target) - return -EINVAL; - - /* - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must - * use the same target. 
- */ - if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) - return -EINVAL; - - /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ - for (i = 0; i < sizeof(init->features) * 8; i++) { - bool set = (init->features[i / 32] & (1 << (i % 32))); - - if (set && i >= KVM_VCPU_MAX_FEATURES) - return -ENOENT; - - /* - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must - * use the same feature set. - */ - if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && - test_bit(i, vcpu->arch.features) != set) - return -EINVAL; - - if (set) - set_bit(i, vcpu->arch.features); - } - - vcpu->arch.target = phys_target; - - /* Now we know what it is, we can reset it. */ - return kvm_reset_vcpu(vcpu); -} - - -static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, - struct kvm_vcpu_init *init) -{ - int ret; - - ret = kvm_vcpu_set_target(vcpu, init); - if (ret) - return ret; - - /* - * Ensure a rebooted VM will fault in RAM pages and detect if the - * guest MMU is turned off and flush the caches as needed. - */ - if (vcpu->arch.has_run_once) - stage2_unmap_vm(vcpu->kvm); - - vcpu_reset_hcr(vcpu); - - /* - * Handle the "start in power-off" case. 
- */ - if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) - vcpu_power_off(vcpu); - else - vcpu->arch.power_off = false; - - return 0; -} - -static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - int ret = -ENXIO; - - switch (attr->group) { - default: - ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr); - break; - } - - return ret; -} - -static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - int ret = -ENXIO; - - switch (attr->group) { - default: - ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr); - break; - } - - return ret; -} - -static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - int ret = -ENXIO; - - switch (attr->group) { - default: - ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr); - break; - } - - return ret; -} - -static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - memset(events, 0, sizeof(*events)); - - return __kvm_arm_vcpu_get_events(vcpu, events); -} - -static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - int i; - - /* check whether the reserved field is zero */ - for (i = 0; i < ARRAY_SIZE(events->reserved); i++) - if (events->reserved[i]) - return -EINVAL; - - /* check whether the pad field is zero */ - for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++) - if (events->exception.pad[i]) - return -EINVAL; - - return __kvm_arm_vcpu_set_events(vcpu, events); -} - -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; - struct kvm_device_attr attr; - long r; - - switch (ioctl) { - case KVM_ARM_VCPU_INIT: { - struct kvm_vcpu_init init; - - r = -EFAULT; - if (copy_from_user(&init, argp, sizeof(init))) - break; - - r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); - break; - } - case KVM_SET_ONE_REG: - case KVM_GET_ONE_REG: { - struct 
kvm_one_reg reg; - - r = -ENOEXEC; - if (unlikely(!kvm_vcpu_initialized(vcpu))) - break; - - r = -EFAULT; - if (copy_from_user(&reg, argp, sizeof(reg))) - break; - - if (ioctl == KVM_SET_ONE_REG) - r = kvm_arm_set_reg(vcpu, &reg); - else - r = kvm_arm_get_reg(vcpu, &reg); - break; - } - case KVM_GET_REG_LIST: { - struct kvm_reg_list __user *user_list = argp; - struct kvm_reg_list reg_list; - unsigned n; - - r = -ENOEXEC; - if (unlikely(!kvm_vcpu_initialized(vcpu))) - break; - - r = -EFAULT; - if (copy_from_user(&reg_list, user_list, sizeof(reg_list))) - break; - n = reg_list.n; - reg_list.n = kvm_arm_num_regs(vcpu); - if (copy_to_user(user_list, &reg_list, sizeof(reg_list))) - break; - r = -E2BIG; - if (n < reg_list.n) - break; - r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); - break; - } - case KVM_SET_DEVICE_ATTR: { - r = -EFAULT; - if (copy_from_user(&attr, argp, sizeof(attr))) - break; - r = kvm_arm_vcpu_set_attr(vcpu, &attr); - break; - } - case KVM_GET_DEVICE_ATTR: { - r = -EFAULT; - if (copy_from_user(&attr, argp, sizeof(attr))) - break; - r = kvm_arm_vcpu_get_attr(vcpu, &attr); - break; - } - case KVM_HAS_DEVICE_ATTR: { - r = -EFAULT; - if (copy_from_user(&attr, argp, sizeof(attr))) - break; - r = kvm_arm_vcpu_has_attr(vcpu, &attr); - break; - } - case KVM_GET_VCPU_EVENTS: { - struct kvm_vcpu_events events; - - if (kvm_arm_vcpu_get_events(vcpu, &events)) - return -EINVAL; - - if (copy_to_user(argp, &events, sizeof(events))) - return -EFAULT; - - return 0; - } - case KVM_SET_VCPU_EVENTS: { - struct kvm_vcpu_events events; - - if (copy_from_user(&events, argp, sizeof(events))) - return -EFAULT; - - return kvm_arm_vcpu_set_events(vcpu, &events); - } - default: - r = -EINVAL; - } - - return r; -} - -/** - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot - * @kvm: kvm instance - * @log: slot id and address to which we copy the log - * - * Steps 1-4 below provide general overview of dirty page logging. 
See - * kvm_get_dirty_log_protect() function description for additional details. - * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. - * - * 1. Take a snapshot of the bit and clear it if needed. - * 2. Write protect the corresponding page. - * 3. Copy the snapshot to the userspace. - * 4. Flush TLB's if needed. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - r = kvm_get_dirty_log_protect(kvm, log, &flush); - - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; -} - -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) -{ - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - r = kvm_clear_dirty_log_protect(kvm, log, &flush); - - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; -} - -static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, - struct kvm_arm_device_addr *dev_addr) -{ - unsigned long dev_id, type; - - dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >> - KVM_ARM_DEVICE_ID_SHIFT; - type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >> - KVM_ARM_DEVICE_TYPE_SHIFT; - - switch (dev_id) { - case KVM_ARM_DEVICE_VGIC_V2: - if (!vgic_present) - return -ENXIO; - return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); - default: - return -ENODEV; - } -} - -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; - - switch (ioctl) { - case KVM_CREATE_IRQCHIP: { - int ret; - if (!vgic_present) - return -ENXIO; - 
mutex_lock(&kvm->lock); - ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); - mutex_unlock(&kvm->lock); - return ret; - } - case KVM_ARM_SET_DEVICE_ADDR: { - struct kvm_arm_device_addr dev_addr; - - if (copy_from_user(&dev_addr, argp, sizeof(dev_addr))) - return -EFAULT; - return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); - } - case KVM_ARM_PREFERRED_TARGET: { - int err; - struct kvm_vcpu_init init; - - err = kvm_vcpu_preferred_target(&init); - if (err) - return err; - - if (copy_to_user(argp, &init, sizeof(init))) - return -EFAULT; - - return 0; - } - default: - return -EINVAL; - } -} - -static void cpu_init_hyp_mode(void *dummy) -{ - phys_addr_t pgd_ptr; - unsigned long hyp_stack_ptr; - unsigned long stack_page; - unsigned long vector_ptr; - - /* Switch from the HYP stub to our own HYP init vector */ - __hyp_set_vectors(kvm_get_idmap_vector()); - - pgd_ptr = kvm_mmu_get_httbr(); - stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); - hyp_stack_ptr = stack_page + PAGE_SIZE; - vector_ptr = (unsigned long)kvm_get_hyp_vector(); - - __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); - __cpu_init_stage2(); -} - -static void cpu_hyp_reset(void) -{ - if (!is_kernel_in_hyp_mode()) - __hyp_reset_vectors(); -} - -static void cpu_hyp_reinit(void) -{ - cpu_hyp_reset(); - - if (is_kernel_in_hyp_mode()) - kvm_timer_init_vhe(); - else - cpu_init_hyp_mode(NULL); - - kvm_arm_init_debug(); - - if (vgic_present) - kvm_vgic_init_cpu_hardware(); -} - -static void _kvm_arch_hardware_enable(void *discard) -{ - if (!__this_cpu_read(kvm_arm_hardware_enabled)) { - cpu_hyp_reinit(); - __this_cpu_write(kvm_arm_hardware_enabled, 1); - } -} - -int kvm_arch_hardware_enable(void) -{ - _kvm_arch_hardware_enable(NULL); - return 0; -} - -static void _kvm_arch_hardware_disable(void *discard) -{ - if (__this_cpu_read(kvm_arm_hardware_enabled)) { - cpu_hyp_reset(); - __this_cpu_write(kvm_arm_hardware_enabled, 0); - } -} - -void kvm_arch_hardware_disable(void) -{ - 
_kvm_arch_hardware_disable(NULL); -} - -#ifdef CONFIG_CPU_PM -static int hyp_init_cpu_pm_notifier(struct notifier_block *self, - unsigned long cmd, - void *v) -{ - /* - * kvm_arm_hardware_enabled is left with its old value over - * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should - * re-enable hyp. - */ - switch (cmd) { - case CPU_PM_ENTER: - if (__this_cpu_read(kvm_arm_hardware_enabled)) - /* - * don't update kvm_arm_hardware_enabled here - * so that the hardware will be re-enabled - * when we resume. See below. - */ - cpu_hyp_reset(); - - return NOTIFY_OK; - case CPU_PM_ENTER_FAILED: - case CPU_PM_EXIT: - if (__this_cpu_read(kvm_arm_hardware_enabled)) - /* The hardware was enabled before suspend. */ - cpu_hyp_reinit(); - - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block hyp_init_cpu_pm_nb = { - .notifier_call = hyp_init_cpu_pm_notifier, -}; - -static void __init hyp_cpu_pm_init(void) -{ - cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); -} -static void __init hyp_cpu_pm_exit(void) -{ - cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); -} -#else -static inline void hyp_cpu_pm_init(void) -{ -} -static inline void hyp_cpu_pm_exit(void) -{ -} -#endif - -static int init_common_resources(void) -{ - /* set size of VMID supported by CPU */ - kvm_vmid_bits = kvm_get_vmid_bits(); - kvm_info("%d-bit VMID\n", kvm_vmid_bits); - - kvm_set_ipa_limit(); - - return 0; -} - -static int init_subsystems(void) -{ - int err = 0; - - /* - * Enable hardware so that subsystem initialisation can access EL2. 
- */ - on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); - - /* - * Register CPU lower-power notifier - */ - hyp_cpu_pm_init(); - - /* - * Init HYP view of VGIC - */ - err = kvm_vgic_hyp_init(); - switch (err) { - case 0: - vgic_present = true; - break; - case -ENODEV: - case -ENXIO: - vgic_present = false; - err = 0; - break; - default: - goto out; - } - - /* - * Init HYP architected timer support - */ - err = kvm_timer_hyp_init(vgic_present); - if (err) - goto out; - - kvm_perf_init(); - kvm_coproc_table_init(); - -out: - on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); - - return err; -} - -static void teardown_hyp_mode(void) -{ - int cpu; - - free_hyp_pgds(); - for_each_possible_cpu(cpu) - free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); - hyp_cpu_pm_exit(); -} - -/** - * Inits Hyp-mode on all online CPUs - */ -static int init_hyp_mode(void) -{ - int cpu; - int err = 0; - - /* - * Allocate Hyp PGD and setup Hyp identity mapping - */ - err = kvm_mmu_init(); - if (err) - goto out_err; - - /* - * Allocate stack pages for Hypervisor-mode - */ - for_each_possible_cpu(cpu) { - unsigned long stack_page; - - stack_page = __get_free_page(GFP_KERNEL); - if (!stack_page) { - err = -ENOMEM; - goto out_err; - } - - per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; - } - - /* - * Map the Hyp-code called directly from the host - */ - err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), - kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); - if (err) { - kvm_err("Cannot map world-switch code\n"); - goto out_err; - } - - err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), - kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); - if (err) { - kvm_err("Cannot map rodata section\n"); - goto out_err; - } - - err = create_hyp_mappings(kvm_ksym_ref(__bss_start), - kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); - if (err) { - kvm_err("Cannot map bss section\n"); - goto out_err; - } - - err = kvm_map_vectors(); - if (err) { - kvm_err("Cannot map vectors\n"); - goto out_err; - } - - /* - * Map the Hyp 
stack pages - */ - for_each_possible_cpu(cpu) { - char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); - err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, - PAGE_HYP); - - if (err) { - kvm_err("Cannot map hyp stack\n"); - goto out_err; - } - } - - for_each_possible_cpu(cpu) { - kvm_cpu_context_t *cpu_ctxt; - - cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu); - err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP); - - if (err) { - kvm_err("Cannot map host CPU state: %d\n", err); - goto out_err; - } - } - - err = hyp_map_aux_data(); - if (err) - kvm_err("Cannot map host auxilary data: %d\n", err); - - return 0; - -out_err: - teardown_hyp_mode(); - kvm_err("error initializing Hyp mode: %d\n", err); - return err; -} - -static void check_kvm_target_cpu(void *ret) -{ - *(int *)ret = kvm_target_cpu(); -} - -struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) -{ - struct kvm_vcpu *vcpu; - int i; - - mpidr &= MPIDR_HWID_BITMASK; - kvm_for_each_vcpu(i, vcpu, kvm) { - if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) - return vcpu; - } - return NULL; -} - -bool kvm_arch_has_irq_bypass(void) -{ - return true; -} - -int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, - &irqfd->irq_entry); -} -void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, - &irqfd->irq_entry); -} - -void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - kvm_arm_halt_guest(irqfd->kvm); -} - -void kvm_arch_irq_bypass_start(struct 
irq_bypass_consumer *cons) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - kvm_arm_resume_guest(irqfd->kvm); -} - -/** - * Initialize Hyp-mode and memory mappings on all CPUs. - */ -int kvm_arch_init(void *opaque) -{ - int err; - int ret, cpu; - bool in_hyp_mode; - - if (!is_hyp_mode_available()) { - kvm_info("HYP mode not available\n"); - return -ENODEV; - } - - in_hyp_mode = is_kernel_in_hyp_mode(); - - if (!in_hyp_mode && kvm_arch_requires_vhe()) { - kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n"); - return -ENODEV; - } - - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); - if (ret < 0) { - kvm_err("Error, CPU %d not supported!\n", cpu); - return -ENODEV; - } - } - - err = init_common_resources(); - if (err) - return err; - - if (!in_hyp_mode) { - err = init_hyp_mode(); - if (err) - goto out_err; - } - - err = init_subsystems(); - if (err) - goto out_hyp; - - if (in_hyp_mode) - kvm_info("VHE mode initialized successfully\n"); - else - kvm_info("Hyp mode initialized successfully\n"); - - return 0; - -out_hyp: - if (!in_hyp_mode) - teardown_hyp_mode(); -out_err: - return err; -} - -/* NOP: Compiling as a module not supported */ -void kvm_arch_exit(void) -{ - kvm_perf_teardown(); -} - -static int arm_init(void) -{ - int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); - return rc; -} - -module_init(arm_init); diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c deleted file mode 100644 index 77754a62eb0c..000000000000 --- a/virt/kvm/arm/hyp/timer-sr.c +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (C) 2012-2015 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <clocksource/arm_arch_timer.h> -#include <linux/compiler.h> -#include <linux/kvm_host.h> - -#include <asm/kvm_hyp.h> - -void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high) -{ - u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low; - write_sysreg(cntvoff, cntvoff_el2); -} - -/* - * Should only be called on non-VHE systems. - * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). - */ -void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu) -{ - u64 val; - - /* Allow physical timer/counter access for the host */ - val = read_sysreg(cnthctl_el2); - val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; - write_sysreg(val, cnthctl_el2); -} - -/* - * Should only be called on non-VHE systems. - * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). 
- */ -void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu) -{ - u64 val; - - /* - * Disallow physical timer access for the guest - * Physical counter access is allowed - */ - val = read_sysreg(cnthctl_el2); - val &= ~CNTHCTL_EL1PCEN; - val |= CNTHCTL_EL1PCTEN; - write_sysreg(val, cnthctl_el2); -} diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c deleted file mode 100644 index 9652c453480f..000000000000 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ /dev/null @@ -1,1133 +0,0 @@ -/* - * Copyright (C) 2012-2015 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/compiler.h> -#include <linux/irqchip/arm-gic-v3.h> -#include <linux/kvm_host.h> - -#include <asm/kvm_emulate.h> -#include <asm/kvm_hyp.h> -#include <asm/kvm_mmu.h> - -#define vtr_to_max_lr_idx(v) ((v) & 0xf) -#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) -#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) - -static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) -{ - switch (lr & 0xf) { - case 0: - return read_gicreg(ICH_LR0_EL2); - case 1: - return read_gicreg(ICH_LR1_EL2); - case 2: - return read_gicreg(ICH_LR2_EL2); - case 3: - return read_gicreg(ICH_LR3_EL2); - case 4: - return read_gicreg(ICH_LR4_EL2); - case 5: - return read_gicreg(ICH_LR5_EL2); - case 6: - return read_gicreg(ICH_LR6_EL2); - case 7: - return read_gicreg(ICH_LR7_EL2); - case 8: - return read_gicreg(ICH_LR8_EL2); - case 9: - return read_gicreg(ICH_LR9_EL2); - case 10: - return read_gicreg(ICH_LR10_EL2); - case 11: - return read_gicreg(ICH_LR11_EL2); - case 12: - return read_gicreg(ICH_LR12_EL2); - case 13: - return read_gicreg(ICH_LR13_EL2); - case 14: - return read_gicreg(ICH_LR14_EL2); - case 15: - return read_gicreg(ICH_LR15_EL2); - } - - unreachable(); -} - -static void __hyp_text __gic_v3_set_lr(u64 val, int lr) -{ - switch (lr & 0xf) { - case 0: - write_gicreg(val, ICH_LR0_EL2); - break; - case 1: - write_gicreg(val, ICH_LR1_EL2); - break; - case 2: - write_gicreg(val, ICH_LR2_EL2); - break; - case 3: - write_gicreg(val, ICH_LR3_EL2); - break; - case 4: - write_gicreg(val, ICH_LR4_EL2); - break; - case 5: - write_gicreg(val, ICH_LR5_EL2); - break; - case 6: - write_gicreg(val, ICH_LR6_EL2); - break; - case 7: - write_gicreg(val, ICH_LR7_EL2); - break; - case 8: - write_gicreg(val, ICH_LR8_EL2); - break; - case 9: - write_gicreg(val, ICH_LR9_EL2); - break; - case 10: - write_gicreg(val, ICH_LR10_EL2); - break; - case 11: - write_gicreg(val, ICH_LR11_EL2); - break; - case 12: - write_gicreg(val, ICH_LR12_EL2); - break; - case 13: - 
write_gicreg(val, ICH_LR13_EL2); - break; - case 14: - write_gicreg(val, ICH_LR14_EL2); - break; - case 15: - write_gicreg(val, ICH_LR15_EL2); - break; - } -} - -static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n) -{ - switch (n) { - case 0: - write_gicreg(val, ICH_AP0R0_EL2); - break; - case 1: - write_gicreg(val, ICH_AP0R1_EL2); - break; - case 2: - write_gicreg(val, ICH_AP0R2_EL2); - break; - case 3: - write_gicreg(val, ICH_AP0R3_EL2); - break; - } -} - -static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n) -{ - switch (n) { - case 0: - write_gicreg(val, ICH_AP1R0_EL2); - break; - case 1: - write_gicreg(val, ICH_AP1R1_EL2); - break; - case 2: - write_gicreg(val, ICH_AP1R2_EL2); - break; - case 3: - write_gicreg(val, ICH_AP1R3_EL2); - break; - } -} - -static u32 __hyp_text __vgic_v3_read_ap0rn(int n) -{ - u32 val; - - switch (n) { - case 0: - val = read_gicreg(ICH_AP0R0_EL2); - break; - case 1: - val = read_gicreg(ICH_AP0R1_EL2); - break; - case 2: - val = read_gicreg(ICH_AP0R2_EL2); - break; - case 3: - val = read_gicreg(ICH_AP0R3_EL2); - break; - default: - unreachable(); - } - - return val; -} - -static u32 __hyp_text __vgic_v3_read_ap1rn(int n) -{ - u32 val; - - switch (n) { - case 0: - val = read_gicreg(ICH_AP1R0_EL2); - break; - case 1: - val = read_gicreg(ICH_AP1R1_EL2); - break; - case 2: - val = read_gicreg(ICH_AP1R2_EL2); - break; - case 3: - val = read_gicreg(ICH_AP1R3_EL2); - break; - default: - unreachable(); - } - - return val; -} - -void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - - /* - * Make sure stores to the GIC via the memory mapped interface - * are now visible to the system register interface when reading the - * LRs, and when reading back the VMCR on non-VHE systems. 
- */ - if (used_lrs || !has_vhe()) { - if (!cpu_if->vgic_sre) { - dsb(sy); - isb(); - } - } - - if (used_lrs) { - int i; - u32 elrsr; - - elrsr = read_gicreg(ICH_ELSR_EL2); - - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); - - for (i = 0; i < used_lrs; i++) { - if (elrsr & (1 << i)) - cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; - else - cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); - - __gic_v3_set_lr(0, i); - } - } -} - -void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - int i; - - if (used_lrs) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); - - for (i = 0; i < used_lrs; i++) - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } - - /* - * Ensure that writes to the LRs, and on non-VHE systems ensure that - * the write to the VMCR in __vgic_v3_activate_traps(), will have - * reached the (re)distributors. This ensure the guest will read the - * correct values from the memory-mapped interface. - */ - if (used_lrs || !has_vhe()) { - if (!cpu_if->vgic_sre) { - isb(); - dsb(sy); - } - } -} - -void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - /* - * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a - * Group0 interrupt (as generated in GICv2 mode) to be - * delivered as a FIQ to the guest, with potentially fatal - * consequences. So we must make sure that ICC_SRE_EL1 has - * been actually programmed with the value we want before - * starting to mess with the rest of the GIC, and VMCR_EL2 in - * particular. This logic must be called before - * __vgic_v3_restore_state(). - */ - if (!cpu_if->vgic_sre) { - write_gicreg(0, ICC_SRE_EL1); - isb(); - write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); - - - if (has_vhe()) { - /* - * Ensure that the write to the VMCR will have reached - * the (re)distributors. 
This ensure the guest will - * read the correct values from the memory-mapped - * interface. - */ - isb(); - dsb(sy); - } - } - - /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. - */ - write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, - ICC_SRE_EL2); - - /* - * If we need to trap system registers, we must write - * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected, - */ - if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); -} - -void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val; - - if (!cpu_if->vgic_sre) { - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - } - - val = read_gicreg(ICC_SRE_EL2); - write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); - - if (!cpu_if->vgic_sre) { - /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ - isb(); - write_gicreg(1, ICC_SRE_EL1); - } - - /* - * If we were trapping system registers, we enabled the VGIC even if - * no interrupts were being injected, and we disable it again here. 
- */ - if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) - write_gicreg(0, ICH_HCR_EL2); -} - -void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if; - u64 val; - u32 nr_pre_bits; - - vcpu = kern_hyp_va(vcpu); - cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - val = read_gicreg(ICH_VTR_EL2); - nr_pre_bits = vtr_to_nr_pre_bits(val); - - switch (nr_pre_bits) { - case 7: - cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3); - cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); - case 6: - cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1); - default: - cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0); - } - - switch (nr_pre_bits) { - case 7: - cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3); - cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2); - case 6: - cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1); - default: - cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0); - } -} - -void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if; - u64 val; - u32 nr_pre_bits; - - vcpu = kern_hyp_va(vcpu); - cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - val = read_gicreg(ICH_VTR_EL2); - nr_pre_bits = vtr_to_nr_pre_bits(val); - - switch (nr_pre_bits) { - case 7: - __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3); - __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2); - case 6: - __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1); - default: - __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0); - } - - switch (nr_pre_bits) { - case 7: - __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3); - __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2); - case 6: - __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1); - default: - __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0); - } -} - -void __hyp_text __vgic_v3_init_lrs(void) -{ - int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); - int i; - - for (i = 0; i <= max_lr_idx; i++) - __gic_v3_set_lr(0, i); -} - -u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) -{ - return 
read_gicreg(ICH_VTR_EL2); -} - -u64 __hyp_text __vgic_v3_read_vmcr(void) -{ - return read_gicreg(ICH_VMCR_EL2); -} - -void __hyp_text __vgic_v3_write_vmcr(u32 vmcr) -{ - write_gicreg(vmcr, ICH_VMCR_EL2); -} - -#ifdef CONFIG_ARM64 - -static int __hyp_text __vgic_v3_bpr_min(void) -{ - /* See Pseudocode for VPriorityGroup */ - return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2)); -} - -static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu) -{ - u32 esr = kvm_vcpu_get_hsr(vcpu); - u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; - - return crm != 8; -} - -#define GICv3_IDLE_PRIORITY 0xff - -static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, - u32 vmcr, - u64 *lr_val) -{ - unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs; - u8 priority = GICv3_IDLE_PRIORITY; - int i, lr = -1; - - for (i = 0; i < used_lrs; i++) { - u64 val = __gic_v3_get_lr(i); - u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; - - /* Not pending in the state? */ - if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT) - continue; - - /* Group-0 interrupt, but Group-0 disabled? */ - if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK)) - continue; - - /* Group-1 interrupt, but Group-1 disabled? */ - if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK)) - continue; - - /* Not the highest priority? 
*/ - if (lr_prio >= priority) - continue; - - /* This is a candidate */ - priority = lr_prio; - *lr_val = val; - lr = i; - } - - if (lr == -1) - *lr_val = ICC_IAR1_EL1_SPURIOUS; - - return lr; -} - -static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, - int intid, u64 *lr_val) -{ - unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs; - int i; - - for (i = 0; i < used_lrs; i++) { - u64 val = __gic_v3_get_lr(i); - - if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid && - (val & ICH_LR_ACTIVE_BIT)) { - *lr_val = val; - return i; - } - } - - *lr_val = ICC_IAR1_EL1_SPURIOUS; - return -1; -} - -static int __hyp_text __vgic_v3_get_highest_active_priority(void) -{ - u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); - u32 hap = 0; - int i; - - for (i = 0; i < nr_apr_regs; i++) { - u32 val; - - /* - * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers - * contain the active priority levels for this VCPU - * for the maximum number of supported priority - * levels, and we return the full priority level only - * if the BPR is programmed to its minimum, otherwise - * we return a combination of the priority level and - * subpriority, as determined by the setting of the - * BPR, but without the full subpriority. - */ - val = __vgic_v3_read_ap0rn(i); - val |= __vgic_v3_read_ap1rn(i); - if (!val) { - hap += 32; - continue; - } - - return (hap + __ffs(val)) << __vgic_v3_bpr_min(); - } - - return GICv3_IDLE_PRIORITY; -} - -static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr) -{ - return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; -} - -static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr) -{ - unsigned int bpr; - - if (vmcr & ICH_VMCR_CBPR_MASK) { - bpr = __vgic_v3_get_bpr0(vmcr); - if (bpr < 7) - bpr++; - } else { - bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; - } - - return bpr; -} - -/* - * Convert a priority to a preemption level, taking the relevant BPR - * into account by zeroing the sub-priority bits. 
- */ -static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp) -{ - unsigned int bpr; - - if (!grp) - bpr = __vgic_v3_get_bpr0(vmcr) + 1; - else - bpr = __vgic_v3_get_bpr1(vmcr); - - return pri & (GENMASK(7, 0) << bpr); -} - -/* - * The priority value is independent of any of the BPR values, so we - * normalize it using the minumal BPR value. This guarantees that no - * matter what the guest does with its BPR, we can always set/get the - * same value of a priority. - */ -static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp) -{ - u8 pre, ap; - u32 val; - int apr; - - pre = __vgic_v3_pri_to_pre(pri, vmcr, grp); - ap = pre >> __vgic_v3_bpr_min(); - apr = ap / 32; - - if (!grp) { - val = __vgic_v3_read_ap0rn(apr); - __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr); - } else { - val = __vgic_v3_read_ap1rn(apr); - __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr); - } -} - -static int __hyp_text __vgic_v3_clear_highest_active_priority(void) -{ - u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); - u32 hap = 0; - int i; - - for (i = 0; i < nr_apr_regs; i++) { - u32 ap0, ap1; - int c0, c1; - - ap0 = __vgic_v3_read_ap0rn(i); - ap1 = __vgic_v3_read_ap1rn(i); - if (!ap0 && !ap1) { - hap += 32; - continue; - } - - c0 = ap0 ? __ffs(ap0) : 32; - c1 = ap1 ? 
__ffs(ap1) : 32; - - /* Always clear the LSB, which is the highest priority */ - if (c0 < c1) { - ap0 &= ~BIT(c0); - __vgic_v3_write_ap0rn(ap0, i); - hap += c0; - } else { - ap1 &= ~BIT(c1); - __vgic_v3_write_ap1rn(ap1, i); - hap += c1; - } - - /* Rescale to 8 bits of priority */ - return hap << __vgic_v3_bpr_min(); - } - - return GICv3_IDLE_PRIORITY; -} - -static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 lr_val; - u8 lr_prio, pmr; - int lr, grp; - - grp = __vgic_v3_get_group(vcpu); - - lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); - if (lr < 0) - goto spurious; - - if (grp != !!(lr_val & ICH_LR_GROUP)) - goto spurious; - - pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; - lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; - if (pmr <= lr_prio) - goto spurious; - - if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp)) - goto spurious; - - lr_val &= ~ICH_LR_STATE; - /* No active state for LPIs */ - if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI) - lr_val |= ICH_LR_ACTIVE_BIT; - __gic_v3_set_lr(lr_val, lr); - __vgic_v3_set_active_priority(lr_prio, vmcr, grp); - vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); - return; - -spurious: - vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS); -} - -static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val) -{ - lr_val &= ~ICH_LR_ACTIVE_BIT; - if (lr_val & ICH_LR_HW) { - u32 pid; - - pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT; - gic_write_dir(pid); - } - - __gic_v3_set_lr(lr_val, lr); -} - -static void __hyp_text __vgic_v3_bump_eoicount(void) -{ - u32 hcr; - - hcr = read_gicreg(ICH_HCR_EL2); - hcr += 1 << ICH_HCR_EOIcount_SHIFT; - write_gicreg(hcr, ICH_HCR_EL2); -} - -static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 vid = vcpu_get_reg(vcpu, rt); - u64 lr_val; - int lr; - - /* EOImode == 0, nothing to be done here */ - if (!(vmcr & 
ICH_VMCR_EOIM_MASK)) - return; - - /* No deactivate to be performed on an LPI */ - if (vid >= VGIC_MIN_LPI) - return; - - lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); - if (lr == -1) { - __vgic_v3_bump_eoicount(); - return; - } - - __vgic_v3_clear_active_lr(lr, lr_val); -} - -static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u32 vid = vcpu_get_reg(vcpu, rt); - u64 lr_val; - u8 lr_prio, act_prio; - int lr, grp; - - grp = __vgic_v3_get_group(vcpu); - - /* Drop priority in any case */ - act_prio = __vgic_v3_clear_highest_active_priority(); - - /* If EOIing an LPI, no deactivate to be performed */ - if (vid >= VGIC_MIN_LPI) - return; - - /* EOImode == 1, nothing to be done here */ - if (vmcr & ICH_VMCR_EOIM_MASK) - return; - - lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); - if (lr == -1) { - __vgic_v3_bump_eoicount(); - return; - } - - lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; - - /* If priorities or group do not match, the guest has fscked-up. 
*/ - if (grp != !!(lr_val & ICH_LR_GROUP) || - __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio) - return; - - /* Let's now perform the deactivation */ - __vgic_v3_clear_active_lr(lr, lr_val); -} - -static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK)); -} - -static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK)); -} - -static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 val = vcpu_get_reg(vcpu, rt); - - if (val & 1) - vmcr |= ICH_VMCR_ENG0_MASK; - else - vmcr &= ~ICH_VMCR_ENG0_MASK; - - __vgic_v3_write_vmcr(vmcr); -} - -static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 val = vcpu_get_reg(vcpu, rt); - - if (val & 1) - vmcr |= ICH_VMCR_ENG1_MASK; - else - vmcr &= ~ICH_VMCR_ENG1_MASK; - - __vgic_v3_write_vmcr(vmcr); -} - -static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr)); -} - -static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr)); -} - -static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 val = vcpu_get_reg(vcpu, rt); - u8 bpr_min = __vgic_v3_bpr_min() - 1; - - /* Enforce BPR limiting */ - if (val < bpr_min) - val = bpr_min; - - val <<= ICH_VMCR_BPR0_SHIFT; - val &= ICH_VMCR_BPR0_MASK; - vmcr &= ~ICH_VMCR_BPR0_MASK; - vmcr |= val; - - __vgic_v3_write_vmcr(vmcr); -} - -static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) -{ - u64 val = vcpu_get_reg(vcpu, rt); - u8 bpr_min = __vgic_v3_bpr_min(); - - if (vmcr & ICH_VMCR_CBPR_MASK) - return; - - /* Enforce BPR limiting */ - if (val < bpr_min) - val = bpr_min; - - val <<= ICH_VMCR_BPR1_SHIFT; - val &= 
ICH_VMCR_BPR1_MASK; - vmcr &= ~ICH_VMCR_BPR1_MASK; - vmcr |= val; - - __vgic_v3_write_vmcr(vmcr); -} - -static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n) -{ - u32 val; - - if (!__vgic_v3_get_group(vcpu)) - val = __vgic_v3_read_ap0rn(n); - else - val = __vgic_v3_read_ap1rn(n); - - vcpu_set_reg(vcpu, rt, val); -} - -static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n) -{ - u32 val = vcpu_get_reg(vcpu, rt); - - if (!__vgic_v3_get_group(vcpu)) - __vgic_v3_write_ap0rn(val, n); - else - __vgic_v3_write_ap1rn(val, n); -} - -static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_read_apxrn(vcpu, rt, 0); -} - -static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_read_apxrn(vcpu, rt, 1); -} - -static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_read_apxrn(vcpu, rt, 2); -} - -static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_read_apxrn(vcpu, rt, 3); -} - -static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_write_apxrn(vcpu, rt, 0); -} - -static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_write_apxrn(vcpu, rt, 1); -} - -static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_write_apxrn(vcpu, rt, 2); -} - -static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - __vgic_v3_write_apxrn(vcpu, rt, 3); -} - -static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u64 lr_val; - int lr, lr_grp, grp; - - grp = __vgic_v3_get_group(vcpu); - - lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); - if (lr == -1) - goto spurious; - - lr_grp = !!(lr_val & ICH_LR_GROUP); - if (lr_grp != grp) - lr_val = 
ICC_IAR1_EL1_SPURIOUS; - -spurious: - vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); -} - -static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - vmcr &= ICH_VMCR_PMR_MASK; - vmcr >>= ICH_VMCR_PMR_SHIFT; - vcpu_set_reg(vcpu, rt, vmcr); -} - -static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 val = vcpu_get_reg(vcpu, rt); - - val <<= ICH_VMCR_PMR_SHIFT; - val &= ICH_VMCR_PMR_MASK; - vmcr &= ~ICH_VMCR_PMR_MASK; - vmcr |= val; - - write_gicreg(vmcr, ICH_VMCR_EL2); -} - -static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 val = __vgic_v3_get_highest_active_priority(); - vcpu_set_reg(vcpu, rt, val); -} - -static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 vtr, val; - - vtr = read_gicreg(ICH_VTR_EL2); - /* PRIbits */ - val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; - /* IDbits */ - val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT; - /* A3V */ - val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; - /* EOImode */ - val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT; - /* CBPR */ - val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; - - vcpu_set_reg(vcpu, rt, val); -} - -static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, - u32 vmcr, int rt) -{ - u32 val = vcpu_get_reg(vcpu, rt); - - if (val & ICC_CTLR_EL1_CBPR_MASK) - vmcr |= ICH_VMCR_CBPR_MASK; - else - vmcr &= ~ICH_VMCR_CBPR_MASK; - - if (val & ICC_CTLR_EL1_EOImode_MASK) - vmcr |= ICH_VMCR_EOIM_MASK; - else - vmcr &= ~ICH_VMCR_EOIM_MASK; - - write_gicreg(vmcr, ICH_VMCR_EL2); -} - -int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) -{ - int rt; - u32 esr; - u32 vmcr; - void (*fn)(struct kvm_vcpu *, u32, int); - bool is_read; - u32 sysreg; - - esr = kvm_vcpu_get_hsr(vcpu); - if 
(vcpu_mode_is_32bit(vcpu)) { - if (!kvm_condition_valid(vcpu)) { - __kvm_skip_instr(vcpu); - return 1; - } - - sysreg = esr_cp15_to_sysreg(esr); - } else { - sysreg = esr_sys64_to_sysreg(esr); - } - - is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; - - switch (sysreg) { - case SYS_ICC_IAR0_EL1: - case SYS_ICC_IAR1_EL1: - if (unlikely(!is_read)) - return 0; - fn = __vgic_v3_read_iar; - break; - case SYS_ICC_EOIR0_EL1: - case SYS_ICC_EOIR1_EL1: - if (unlikely(is_read)) - return 0; - fn = __vgic_v3_write_eoir; - break; - case SYS_ICC_IGRPEN1_EL1: - if (is_read) - fn = __vgic_v3_read_igrpen1; - else - fn = __vgic_v3_write_igrpen1; - break; - case SYS_ICC_BPR1_EL1: - if (is_read) - fn = __vgic_v3_read_bpr1; - else - fn = __vgic_v3_write_bpr1; - break; - case SYS_ICC_AP0Rn_EL1(0): - case SYS_ICC_AP1Rn_EL1(0): - if (is_read) - fn = __vgic_v3_read_apxr0; - else - fn = __vgic_v3_write_apxr0; - break; - case SYS_ICC_AP0Rn_EL1(1): - case SYS_ICC_AP1Rn_EL1(1): - if (is_read) - fn = __vgic_v3_read_apxr1; - else - fn = __vgic_v3_write_apxr1; - break; - case SYS_ICC_AP0Rn_EL1(2): - case SYS_ICC_AP1Rn_EL1(2): - if (is_read) - fn = __vgic_v3_read_apxr2; - else - fn = __vgic_v3_write_apxr2; - break; - case SYS_ICC_AP0Rn_EL1(3): - case SYS_ICC_AP1Rn_EL1(3): - if (is_read) - fn = __vgic_v3_read_apxr3; - else - fn = __vgic_v3_write_apxr3; - break; - case SYS_ICC_HPPIR0_EL1: - case SYS_ICC_HPPIR1_EL1: - if (unlikely(!is_read)) - return 0; - fn = __vgic_v3_read_hppir; - break; - case SYS_ICC_IGRPEN0_EL1: - if (is_read) - fn = __vgic_v3_read_igrpen0; - else - fn = __vgic_v3_write_igrpen0; - break; - case SYS_ICC_BPR0_EL1: - if (is_read) - fn = __vgic_v3_read_bpr0; - else - fn = __vgic_v3_write_bpr0; - break; - case SYS_ICC_DIR_EL1: - if (unlikely(is_read)) - return 0; - fn = __vgic_v3_write_dir; - break; - case SYS_ICC_RPR_EL1: - if (unlikely(!is_read)) - return 0; - fn = __vgic_v3_read_rpr; - break; - case SYS_ICC_CTLR_EL1: - if (is_read) - fn = 
__vgic_v3_read_ctlr; - else - fn = __vgic_v3_write_ctlr; - break; - case SYS_ICC_PMR_EL1: - if (is_read) - fn = __vgic_v3_read_pmr; - else - fn = __vgic_v3_write_pmr; - break; - default: - return 0; - } - - vmcr = __vgic_v3_read_vmcr(); - rt = kvm_vcpu_sys_get_rt(vcpu); - fn(vcpu, vmcr, rt); - - __kvm_skip_instr(vcpu); - - return 1; -} - -#endif diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c deleted file mode 100644 index 08443a15e6be..000000000000 --- a/virt/kvm/arm/mmio.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- */ - -#include <linux/kvm_host.h> -#include <asm/kvm_mmio.h> -#include <asm/kvm_emulate.h> -#include <trace/events/kvm.h> - -#include "trace.h" - -void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data) -{ - void *datap = NULL; - union { - u8 byte; - u16 hword; - u32 word; - u64 dword; - } tmp; - - switch (len) { - case 1: - tmp.byte = data; - datap = &tmp.byte; - break; - case 2: - tmp.hword = data; - datap = &tmp.hword; - break; - case 4: - tmp.word = data; - datap = &tmp.word; - break; - case 8: - tmp.dword = data; - datap = &tmp.dword; - break; - } - - memcpy(buf, datap, len); -} - -unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) -{ - unsigned long data = 0; - union { - u16 hword; - u32 word; - u64 dword; - } tmp; - - switch (len) { - case 1: - data = *(u8 *)buf; - break; - case 2: - memcpy(&tmp.hword, buf, len); - data = tmp.hword; - break; - case 4: - memcpy(&tmp.word, buf, len); - data = tmp.word; - break; - case 8: - memcpy(&tmp.dword, buf, len); - data = tmp.dword; - break; - } - - return data; -} - -/** - * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation - * or in-kernel IO emulation - * - * @vcpu: The VCPU pointer - * @run: The VCPU run struct containing the mmio data - */ -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - unsigned long data; - unsigned int len; - int mask; - - if (!run->mmio.is_write) { - len = run->mmio.len; - if (len > sizeof(unsigned long)) - return -EINVAL; - - data = kvm_mmio_read_buf(run->mmio.data, len); - - if (vcpu->arch.mmio_decode.sign_extend && - len < sizeof(unsigned long)) { - mask = 1U << ((len * 8) - 1); - data = (data ^ mask) - mask; - } - - trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, - &data); - data = vcpu_data_host_to_guest(vcpu, data, len); - vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); - } - - /* - * The MMIO instruction is emulated and should not be re-executed - * in the guest. 
- */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - - return 0; -} - -static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) -{ - unsigned long rt; - int access_size; - bool sign_extend; - - if (kvm_vcpu_dabt_iss1tw(vcpu)) { - /* page table accesses IO mem: tell guest to fix its TTBR */ - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - return 1; - } - - access_size = kvm_vcpu_dabt_get_as(vcpu); - if (unlikely(access_size < 0)) - return access_size; - - *is_write = kvm_vcpu_dabt_iswrite(vcpu); - sign_extend = kvm_vcpu_dabt_issext(vcpu); - rt = kvm_vcpu_dabt_get_rd(vcpu); - - *len = access_size; - vcpu->arch.mmio_decode.sign_extend = sign_extend; - vcpu->arch.mmio_decode.rt = rt; - - return 0; -} - -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa) -{ - unsigned long data; - unsigned long rt; - int ret; - bool is_write; - int len; - u8 data_buf[8]; - - /* - * Prepare MMIO operation. First decode the syndrome data we get - * from the CPU. Then try if some in-kernel emulation feels - * responsible, otherwise let user space do its magic. - */ - if (kvm_vcpu_dabt_isvalid(vcpu)) { - ret = decode_hsr(vcpu, &is_write, &len); - if (ret) - return ret; - } else { - kvm_err("load/store instruction decoding not implemented\n"); - return -ENOSYS; - } - - rt = vcpu->arch.mmio_decode.rt; - - if (is_write) { - data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), - len); - - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); - kvm_mmio_write_buf(data_buf, len, data); - - ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, - data_buf); - } else { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, - fault_ipa, NULL); - - ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, - data_buf); - } - - /* Now prepare kvm_run for the potential return to userland. 
*/ - run->mmio.is_write = is_write; - run->mmio.phys_addr = fault_ipa; - run->mmio.len = len; - - if (!ret) { - /* We handled the access successfully in the kernel. */ - if (!is_write) - memcpy(run->mmio.data, data_buf, len); - vcpu->stat.mmio_exit_kernel++; - kvm_handle_mmio_return(vcpu, run); - return 1; - } - - if (is_write) - memcpy(run->mmio.data, data_buf, len); - vcpu->stat.mmio_exit_user++; - run->exit_reason = KVM_EXIT_MMIO; - return 0; -} diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c deleted file mode 100644 index fbdf3ac2f001..000000000000 --- a/virt/kvm/arm/mmu.c +++ /dev/null @@ -1,2439 +0,0 @@ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- */ - -#include <linux/mman.h> -#include <linux/kvm_host.h> -#include <linux/io.h> -#include <linux/hugetlb.h> -#include <linux/sched/signal.h> -#include <trace/events/kvm.h> -#include <asm/pgalloc.h> -#include <asm/cacheflush.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_mmu.h> -#include <asm/kvm_mmio.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_emulate.h> -#include <asm/virt.h> -#include <asm/system_misc.h> - -#include "trace.h" - -static pgd_t *boot_hyp_pgd; -static pgd_t *hyp_pgd; -static pgd_t *merged_hyp_pgd; -static DEFINE_MUTEX(kvm_hyp_pgd_mutex); - -static unsigned long hyp_idmap_start; -static unsigned long hyp_idmap_end; -static phys_addr_t hyp_idmap_vector; - -static unsigned long io_map_base; - -#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) - -#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) -#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) - -static bool memslot_is_logging(struct kvm_memory_slot *memslot) -{ - return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); -} - -/** - * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 - * @kvm: pointer to kvm structure. - * - * Interface to HYP function to flush all VM TLB entries - */ -void kvm_flush_remote_tlbs(struct kvm *kvm) -{ - kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); -} - -static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) -{ - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); -} - -/* - * D-Cache management functions. They take the page table entries by - * value, as they are flushing the cache using the kernel mapping (or - * kmap on 32bit). 
- */ -static void kvm_flush_dcache_pte(pte_t pte) -{ - __kvm_flush_dcache_pte(pte); -} - -static void kvm_flush_dcache_pmd(pmd_t pmd) -{ - __kvm_flush_dcache_pmd(pmd); -} - -static void kvm_flush_dcache_pud(pud_t pud) -{ - __kvm_flush_dcache_pud(pud); -} - -static bool kvm_is_device_pfn(unsigned long pfn) -{ - return !pfn_valid(pfn); -} - -/** - * stage2_dissolve_pmd() - clear and flush huge PMD entry - * @kvm: pointer to kvm structure. - * @addr: IPA - * @pmd: pmd pointer for IPA - * - * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all - * pages in the range dirty. - */ -static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) -{ - if (!pmd_thp_or_huge(*pmd)) - return; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - put_page(virt_to_page(pmd)); -} - -/** - * stage2_dissolve_pud() - clear and flush huge PUD entry - * @kvm: pointer to kvm structure. - * @addr: IPA - * @pud: pud pointer for IPA - * - * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all - * pages in the range dirty. 
- */ -static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) -{ - if (!stage2_pud_huge(kvm, *pudp)) - return; - - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(kvm, addr); - put_page(virt_to_page(pudp)); -} - -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - int min, int max) -{ - void *page; - - BUG_ON(max > KVM_NR_MEM_OBJS); - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < max) { - page = (void *)__get_free_page(PGALLOC_GFP); - if (!page) - return -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc || !mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; -} - -static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) -{ - pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); - stage2_pgd_clear(kvm, pgd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pud_free(kvm, pud_table); - put_page(virt_to_page(pgd)); -} - -static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) -{ - pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); - VM_BUG_ON(stage2_pud_huge(kvm, *pud)); - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pmd_free(kvm, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - pte_free_kernel(NULL, pte_table); - put_page(virt_to_page(pmd)); -} - -static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte) -{ - WRITE_ONCE(*ptep, new_pte); - dsb(ishst); -} - -static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t 
new_pmd) -{ - WRITE_ONCE(*pmdp, new_pmd); - dsb(ishst); -} - -static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) -{ - kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); -} - -static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) -{ - WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); - dsb(ishst); -} - -static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp) -{ - WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp)); - dsb(ishst); -} - -/* - * Unmapping vs dcache management: - * - * If a guest maps certain memory pages as uncached, all writes will - * bypass the data cache and go directly to RAM. However, the CPUs - * can still speculate reads (not writes) and fill cache lines with - * data. - * - * Those cache lines will be *clean* cache lines though, so a - * clean+invalidate operation is equivalent to an invalidate - * operation, because no cache lines are marked dirty. - * - * Those clean cache lines could be filled prior to an uncached write - * by the guest, and the cache coherent IO subsystem would therefore - * end up writing old data to disk. - * - * This is why right after unmapping a page/section and invalidating - * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure - * the IO subsystem will never hit in the cache. - * - * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as - * we then fully enforce cacheability of RAM, no matter what the guest - * does. 
- */ -static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t start_addr = addr; - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - pte_t old_pte = *pte; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(kvm, addr); - - /* No need to invalidate the cache for device mappings */ - if (!kvm_is_device_pfn(pte_pfn(old_pte))) - kvm_flush_dcache_pte(old_pte); - - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (stage2_pte_table_empty(kvm, start_pte)) - clear_stage2_pmd_entry(kvm, pmd, start_addr); -} - -static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next, start_addr = addr; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - pmd_t old_pmd = *pmd; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - - kvm_flush_dcache_pmd(old_pmd); - - put_page(virt_to_page(pmd)); - } else { - unmap_stage2_ptes(kvm, pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); - - if (stage2_pmd_table_empty(kvm, start_pmd)) - clear_stage2_pud_entry(kvm, pud, start_addr); -} - -static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next, start_addr = addr; - pud_t *pud, *start_pud; - - start_pud = pud = stage2_pud_offset(kvm, pgd, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - pud_t old_pud = *pud; - - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(kvm, addr); - kvm_flush_dcache_pud(old_pud); - put_page(virt_to_page(pud)); - } else { - unmap_stage2_pmds(kvm, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); - - if 
(stage2_pud_table_empty(kvm, start_pud)) - clear_stage2_pgd_entry(kvm, pgd, start_addr); -} - -/** - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range - * @kvm: The VM pointer - * @start: The intermediate physical base address of the range to unmap - * @size: The size of the area to unmap - * - * Clear a range of stage-2 mappings, lowering the various ref-counts. Must - * be called while holding mmu_lock (unless for freeing the stage2 pgd before - * destroying the VM), otherwise another faulting VCPU may come in and mess - * with things behind our backs. - */ -static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) -{ - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; - - assert_spin_locked(&kvm->mmu_lock); - WARN_ON(size & ~PAGE_MASK); - - pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Make sure the page table is still active, as another thread - * could have possibly freed the page table, while we released - * the lock. - */ - if (!READ_ONCE(kvm->arch.pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - unmap_stage2_puds(kvm, pgd, addr, next); - /* - * If the range is too large, release the kvm->mmu_lock - * to prevent starvation and lockup detector warnings. 
- */ - if (next != end) - cond_resched_lock(&kvm->mmu_lock); - } while (pgd++, addr = next, addr != end); -} - -static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) - kvm_flush_dcache_pte(*pte); - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) - kvm_flush_dcache_pmd(*pmd); - else - stage2_flush_ptes(kvm, pmd, addr, next); - } - } while (pmd++, addr = next, addr != end); -} - -static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, pgd, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) - kvm_flush_dcache_pud(*pud); - else - stage2_flush_pmds(kvm, pud, addr, next); - } - } while (pud++, addr = next, addr != end); -} - -static void stage2_flush_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; - phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - phys_addr_t next; - pgd_t *pgd; - - pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); - do { - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - stage2_flush_puds(kvm, pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} - -/** - * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 - * @kvm: The struct kvm pointer - * - * Go through the stage 2 page tables and invalidate any cache lines - * backing memory already mapped to the VM. 
- */ -static void stage2_flush_vm(struct kvm *kvm) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int idx; - - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); - - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) - stage2_flush_memslot(kvm, memslot); - - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); -} - -static void clear_hyp_pgd_entry(pgd_t *pgd) -{ - pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); - pgd_clear(pgd); - pud_free(NULL, pud_table); - put_page(virt_to_page(pgd)); -} - -static void clear_hyp_pud_entry(pud_t *pud) -{ - pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); - VM_BUG_ON(pud_huge(*pud)); - pud_clear(pud); - pmd_free(NULL, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_hyp_pmd_entry(pmd_t *pmd) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - pte_free_kernel(NULL, pte_table); - put_page(virt_to_page(pmd)); -} - -static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - kvm_set_pte(pte, __pte(0)); - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (hyp_pte_table_empty(start_pte)) - clear_hyp_pmd_entry(pmd); -} - -static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - /* Hyp doesn't use huge pmds */ - if (!pmd_none(*pmd)) - unmap_hyp_ptes(pmd, addr, next); - } while (pmd++, addr = next, addr != end); - - if (hyp_pmd_table_empty(start_pmd)) - clear_hyp_pud_entry(pud); -} - -static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pud_t *pud, *start_pud; - - start_pud = pud = pud_offset(pgd, addr); - do { - next = 
pud_addr_end(addr, end); - /* Hyp doesn't use huge puds */ - if (!pud_none(*pud)) - unmap_hyp_pmds(pud, addr, next); - } while (pud++, addr = next, addr != end); - - if (hyp_pud_table_empty(start_pud)) - clear_hyp_pgd_entry(pgd); -} - -static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) -{ - return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); -} - -static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, - phys_addr_t start, u64 size) -{ - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; - - /* - * We don't unmap anything from HYP, except at the hyp tear down. - * Hence, we don't have to invalidate the TLBs here. - */ - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - do { - next = pgd_addr_end(addr, end); - if (!pgd_none(*pgd)) - unmap_hyp_puds(pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} - -static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); -} - -static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); -} - -/** - * free_hyp_pgds - free Hyp-mode page tables - * - * Assumes hyp_pgd is a page table used strictly in Hyp-mode and - * therefore contains either mappings in the kernel memory area (above - * PAGE_OFFSET), or device mappings in the idmap range. - * - * boot_hyp_pgd should only map the idmap range, and is only used in - * the extended idmap case. - */ -void free_hyp_pgds(void) -{ - pgd_t *id_pgd; - - mutex_lock(&kvm_hyp_pgd_mutex); - - id_pgd = boot_hyp_pgd ? 
boot_hyp_pgd : hyp_pgd; - - if (id_pgd) { - /* In case we never called hyp_mmu_init() */ - if (!io_map_base) - io_map_base = hyp_idmap_start; - unmap_hyp_idmap_range(id_pgd, io_map_base, - hyp_idmap_start + PAGE_SIZE - io_map_base); - } - - if (boot_hyp_pgd) { - free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); - boot_hyp_pgd = NULL; - } - - if (hyp_pgd) { - unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), - (uintptr_t)high_memory - PAGE_OFFSET); - - free_pages((unsigned long)hyp_pgd, hyp_pgd_order); - hyp_pgd = NULL; - } - if (merged_hyp_pgd) { - clear_page(merged_hyp_pgd); - free_page((unsigned long)merged_hyp_pgd); - merged_hyp_pgd = NULL; - } - - mutex_unlock(&kvm_hyp_pgd_mutex); -} - -static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pte_t *pte; - unsigned long addr; - - addr = start; - do { - pte = pte_offset_kernel(pmd, addr); - kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); - get_page(virt_to_page(pte)); - pfn++; - } while (addr += PAGE_SIZE, addr != end); -} - -static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pmd_t *pmd; - pte_t *pte; - unsigned long addr, next; - - addr = start; - do { - pmd = pmd_offset(pud, addr); - - BUG_ON(pmd_sect(*pmd)); - - if (pmd_none(*pmd)) { - pte = pte_alloc_one_kernel(NULL); - if (!pte) { - kvm_err("Cannot allocate Hyp pte\n"); - return -ENOMEM; - } - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - next = pmd_addr_end(addr, end); - - create_hyp_pte_mappings(pmd, addr, next, pfn, prot); - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pud_t *pud; - pmd_t *pmd; - unsigned long addr, next; - int ret; - - addr = start; - do { - pud = pud_offset(pgd, addr); - - if 
(pud_none_or_clear_bad(pud)) { - pmd = pmd_alloc_one(NULL, addr); - if (!pmd) { - kvm_err("Cannot allocate Hyp pmd\n"); - return -ENOMEM; - } - kvm_pud_populate(pud, pmd); - get_page(virt_to_page(pud)); - } - - next = pud_addr_end(addr, end); - ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); - if (ret) - return ret; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, - unsigned long start, unsigned long end, - unsigned long pfn, pgprot_t prot) -{ - pgd_t *pgd; - pud_t *pud; - unsigned long addr, next; - int err = 0; - - mutex_lock(&kvm_hyp_pgd_mutex); - addr = start & PAGE_MASK; - end = PAGE_ALIGN(end); - do { - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - - if (pgd_none(*pgd)) { - pud = pud_alloc_one(NULL, addr); - if (!pud) { - kvm_err("Cannot allocate Hyp pud\n"); - err = -ENOMEM; - goto out; - } - kvm_pgd_populate(pgd, pud); - get_page(virt_to_page(pgd)); - } - - next = pgd_addr_end(addr, end); - err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot); - if (err) - goto out; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); -out: - mutex_unlock(&kvm_hyp_pgd_mutex); - return err; -} - -static phys_addr_t kvm_kaddr_to_phys(void *kaddr) -{ - if (!is_vmalloc_addr(kaddr)) { - BUG_ON(!virt_addr_valid(kaddr)); - return __pa(kaddr); - } else { - return page_to_phys(vmalloc_to_page(kaddr)) + - offset_in_page(kaddr); - } -} - -/** - * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode - * @from: The virtual kernel start address of the range - * @to: The virtual kernel end address of the range (exclusive) - * @prot: The protection to be applied to this range - * - * The same virtual address as the kernel virtual address is also used - * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying - * physical pages. 
- */ -int create_hyp_mappings(void *from, void *to, pgprot_t prot) -{ - phys_addr_t phys_addr; - unsigned long virt_addr; - unsigned long start = kern_hyp_va((unsigned long)from); - unsigned long end = kern_hyp_va((unsigned long)to); - - if (is_kernel_in_hyp_mode()) - return 0; - - start = start & PAGE_MASK; - end = PAGE_ALIGN(end); - - for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { - int err; - - phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); - err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, - virt_addr, virt_addr + PAGE_SIZE, - __phys_to_pfn(phys_addr), - prot); - if (err) - return err; - } - - return 0; -} - -static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, - unsigned long *haddr, pgprot_t prot) -{ - pgd_t *pgd = hyp_pgd; - unsigned long base; - int ret = 0; - - mutex_lock(&kvm_hyp_pgd_mutex); - - /* - * This assumes that we we have enough space below the idmap - * page to allocate our VAs. If not, the check below will - * kick. A potential alternative would be to detect that - * overflow and switch to an allocation above the idmap. - * - * The allocated size is always a multiple of PAGE_SIZE. - */ - size = PAGE_ALIGN(size + offset_in_page(phys_addr)); - base = io_map_base - size; - - /* - * Verify that BIT(VA_BITS - 1) hasn't been flipped by - * allocating the new area, as it would indicate we've - * overflowed the idmap/IO address range. 
- */ - if ((base ^ io_map_base) & BIT(VA_BITS - 1)) - ret = -ENOMEM; - else - io_map_base = base; - - mutex_unlock(&kvm_hyp_pgd_mutex); - - if (ret) - goto out; - - if (__kvm_cpu_uses_extended_idmap()) - pgd = boot_hyp_pgd; - - ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - base, base + size, - __phys_to_pfn(phys_addr), prot); - if (ret) - goto out; - - *haddr = base + offset_in_page(phys_addr); - -out: - return ret; -} - -/** - * create_hyp_io_mappings - Map IO into both kernel and HYP - * @phys_addr: The physical start address which gets mapped - * @size: Size of the region being mapped - * @kaddr: Kernel VA for this mapping - * @haddr: HYP VA for this mapping - */ -int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, - void __iomem **kaddr, - void __iomem **haddr) -{ - unsigned long addr; - int ret; - - *kaddr = ioremap(phys_addr, size); - if (!*kaddr) - return -ENOMEM; - - if (is_kernel_in_hyp_mode()) { - *haddr = *kaddr; - return 0; - } - - ret = __create_hyp_private_mapping(phys_addr, size, - &addr, PAGE_HYP_DEVICE); - if (ret) { - iounmap(*kaddr); - *kaddr = NULL; - *haddr = NULL; - return ret; - } - - *haddr = (void __iomem *)addr; - return 0; -} - -/** - * create_hyp_exec_mappings - Map an executable range into HYP - * @phys_addr: The physical start address which gets mapped - * @size: Size of the region being mapped - * @haddr: HYP VA for this mapping - */ -int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, - void **haddr) -{ - unsigned long addr; - int ret; - - BUG_ON(is_kernel_in_hyp_mode()); - - ret = __create_hyp_private_mapping(phys_addr, size, - &addr, PAGE_HYP_EXEC); - if (ret) { - *haddr = NULL; - return ret; - } - - *haddr = (void *)addr; - return 0; -} - -/** - * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. - * @kvm: The KVM struct pointer for the VM. 
- * - * Allocates only the stage-2 HW PGD level table(s) (can support either full - * 40-bit input addresses or limited to 32-bit input addresses). Clears the - * allocated pages. - * - * Note we don't need locking here as this is only called when the VM is - * created, which can only be done once. - */ -int kvm_alloc_stage2_pgd(struct kvm *kvm) -{ - pgd_t *pgd; - - if (kvm->arch.pgd != NULL) { - kvm_err("kvm_arch already initialized?\n"); - return -EINVAL; - } - - /* Allocate the HW PGD, making sure that each page gets its own refcount */ - pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); - if (!pgd) - return -ENOMEM; - - kvm->arch.pgd = pgd; - return 0; -} - -static void stage2_unmap_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - hva_t hva = memslot->userspace_addr; - phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; - phys_addr_t size = PAGE_SIZE * memslot->npages; - hva_t reg_end = hva + size; - - /* - * A memory region could potentially cover multiple VMAs, and any holes - * between them, so iterate over all of them to find out if we should - * unmap any of them. 
- * - * +--------------------------------------------+ - * +---------------+----------------+ +----------------+ - * | : VMA 1 | VMA 2 | | VMA 3 : | - * +---------------+----------------+ +----------------+ - * | memory region | - * +--------------------------------------------+ - */ - do { - struct vm_area_struct *vma = find_vma(current->mm, hva); - hva_t vm_start, vm_end; - - if (!vma || vma->vm_start >= reg_end) - break; - - /* - * Take the intersection of this VMA with the memory region - */ - vm_start = max(hva, vma->vm_start); - vm_end = min(reg_end, vma->vm_end); - - if (!(vma->vm_flags & VM_PFNMAP)) { - gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - unmap_stage2_range(kvm, gpa, vm_end - vm_start); - } - hva = vm_end; - } while (hva < reg_end); -} - -/** - * stage2_unmap_vm - Unmap Stage-2 RAM mappings - * @kvm: The struct kvm pointer - * - * Go through the memregions and unmap any reguler RAM - * backing memory already mapped to the VM. - */ -void stage2_unmap_vm(struct kvm *kvm) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int idx; - - idx = srcu_read_lock(&kvm->srcu); - down_read(¤t->mm->mmap_sem); - spin_lock(&kvm->mmu_lock); - - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) - stage2_unmap_memslot(kvm, memslot); - - spin_unlock(&kvm->mmu_lock); - up_read(¤t->mm->mmap_sem); - srcu_read_unlock(&kvm->srcu, idx); -} - -/** - * kvm_free_stage2_pgd - free all stage-2 tables - * @kvm: The KVM struct pointer for the VM. - * - * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all - * underlying level-2 and level-3 tables before freeing the actual level-1 table - * and setting the struct pointer to NULL. 
- */ -void kvm_free_stage2_pgd(struct kvm *kvm) -{ - void *pgd = NULL; - - spin_lock(&kvm->mmu_lock); - if (kvm->arch.pgd) { - unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); - pgd = READ_ONCE(kvm->arch.pgd); - kvm->arch.pgd = NULL; - } - spin_unlock(&kvm->mmu_lock); - - /* Free the HW pgd, one page at a time */ - if (pgd) - free_pages_exact(pgd, stage2_pgd_size(kvm)); -} - -static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - pgd_t *pgd; - pud_t *pud; - - pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); - if (stage2_pgd_none(kvm, *pgd)) { - if (!cache) - return NULL; - pud = mmu_memory_cache_alloc(cache); - stage2_pgd_populate(kvm, pgd, pud); - get_page(virt_to_page(pgd)); - } - - return stage2_pud_offset(kvm, pgd, addr); -} - -static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - pud_t *pud; - pmd_t *pmd; - - pud = stage2_get_pud(kvm, cache, addr); - if (!pud || stage2_pud_huge(kvm, *pud)) - return NULL; - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return NULL; - pmd = mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - return stage2_pmd_offset(kvm, pud, addr); -} - -static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache - *cache, phys_addr_t addr, const pmd_t *new_pmd) -{ - pmd_t *pmd, old_pmd; - - pmd = stage2_get_pmd(kvm, cache, addr); - VM_BUG_ON(!pmd); - - old_pmd = *pmd; - if (pmd_present(old_pmd)) { - /* - * Multiple vcpus faulting on the same PMD entry, can - * lead to them sequentially updating the PMD with the - * same value. Following the break-before-make - * (pmd_clear() followed by tlb_flush()) process can - * hinder forward progress due to refaults generated - * on missing translations. - * - * Skip updating the page table if the entry is - * unchanged. 
- */ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - - /* - * Mapping in huge pages should only happen through a - * fault. If a page is merged into a transparent huge - * page, the individual subpages of that huge page - * should be unmapped through MMU notifiers before we - * get here. - * - * Merging of CompoundPages is not supported; they - * should become splitting first, unmapped, merged, - * and mapped back in on-demand. - */ - VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - } else { - get_page(virt_to_page(pmd)); - } - - kvm_set_pmd(pmd, *new_pmd); - return 0; -} - -static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pud_t *new_pudp) -{ - pud_t *pudp, old_pud; - - pudp = stage2_get_pud(kvm, cache, addr); - VM_BUG_ON(!pudp); - - old_pud = *pudp; - - /* - * A large number of vcpus faulting on the same stage 2 entry, - * can lead to a refault due to the - * stage2_pud_clear()/tlb_flush(). Skip updating the page - * tables if there is no change. - */ - if (pud_val(old_pud) == pud_val(*new_pudp)) - return 0; - - if (stage2_pud_present(kvm, old_pud)) { - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(kvm, addr); - } else { - get_page(virt_to_page(pudp)); - } - - kvm_set_pud(pudp, *new_pudp); - return 0; -} - -/* - * stage2_get_leaf_entry - walk the stage2 VM page tables and return - * true if a valid and present leaf-entry is found. A pointer to the - * leaf-entry is returned in the appropriate level variable - pudpp, - * pmdpp, ptepp. 
- */ -static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, - pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) -{ - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - *pudpp = NULL; - *pmdpp = NULL; - *ptepp = NULL; - - pudp = stage2_get_pud(kvm, NULL, addr); - if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) - return false; - - if (stage2_pud_huge(kvm, *pudp)) { - *pudpp = pudp; - return true; - } - - pmdp = stage2_pmd_offset(kvm, pudp, addr); - if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) - return false; - - if (pmd_thp_or_huge(*pmdp)) { - *pmdpp = pmdp; - return true; - } - - ptep = pte_offset_kernel(pmdp, addr); - if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) - return false; - - *ptepp = ptep; - return true; -} - -static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) -{ - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - bool found; - - found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); - if (!found) - return false; - - if (pudp) - return kvm_s2pud_exec(pudp); - else if (pmdp) - return kvm_s2pmd_exec(pmdp); - else - return kvm_s2pte_exec(ptep); -} - -static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, - unsigned long flags) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte, old_pte; - bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; - bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; - - VM_BUG_ON(logging_active && !cache); - - /* Create stage-2 page table mapping - Levels 0 and 1 */ - pud = stage2_get_pud(kvm, cache, addr); - if (!pud) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PUD, then continue - * on to allocate page. 
- */ - if (logging_active) - stage2_dissolve_pud(kvm, addr, pud); - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pmd = mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - pmd = stage2_pmd_offset(kvm, pud, addr); - if (!pmd) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PMD, then continue on to - * allocate page. - */ - if (logging_active) - stage2_dissolve_pmd(kvm, addr, pmd); - - /* Create stage-2 page mappings - Level 2 */ - if (pmd_none(*pmd)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pte = mmu_memory_cache_alloc(cache); - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - pte = pte_offset_kernel(pmd, addr); - - if (iomap && pte_present(*pte)) - return -EFAULT; - - /* Create 2nd stage page table mapping - Level 3 */ - old_pte = *pte; - if (pte_present(old_pte)) { - /* Skip page table update if there is no change */ - if (pte_val(old_pte) == pte_val(*new_pte)) - return 0; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(kvm, addr); - } else { - get_page(virt_to_page(pte)); - } - - kvm_set_pte(pte, *new_pte); - return 0; -} - -#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - if (pte_young(*pte)) { - *pte = pte_mkold(*pte); - return 1; - } - return 0; -} -#else -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - return __ptep_test_and_clear_young(pte); -} -#endif - -static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pmd); -} - -static int stage2_pudp_test_and_clear_young(pud_t *pud) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pud); -} - -/** - * kvm_phys_addr_ioremap - map a device range to guest IPA - * - * @kvm: The KVM pointer - * @guest_ipa: The IPA at 
which to insert the mapping - * @pa: The physical address of the device - * @size: The size of the mapping - */ -int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, - phys_addr_t pa, unsigned long size, bool writable) -{ - phys_addr_t addr, end; - int ret = 0; - unsigned long pfn; - struct kvm_mmu_memory_cache cache = { 0, }; - - end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; - pfn = __phys_to_pfn(pa); - - for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { - pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); - - if (writable) - pte = kvm_s2pte_mkwrite(pte); - - ret = mmu_topup_memory_cache(&cache, - kvm_mmu_cache_min_pages(kvm), - KVM_NR_MEM_OBJS); - if (ret) - goto out; - spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(kvm, &cache, addr, &pte, - KVM_S2PTE_FLAG_IS_IOMAP); - spin_unlock(&kvm->mmu_lock); - if (ret) - goto out; - - pfn++; - } - -out: - mmu_free_memory_cache(&cache); - return ret; -} - -static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) -{ - kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *ipap >> PAGE_SHIFT; - struct page *page = pfn_to_page(pfn); - - /* - * PageTransCompoundMap() returns true for THP and - * hugetlbfs. Make sure the adjustment is done only for THP - * pages. - */ - if (!PageHuge(page) && PageTransCompoundMap(page)) { - unsigned long mask; - /* - * The address we faulted on is backed by a transparent huge - * page. However, because we map the compound huge page and - * not the individual tail page, we need to transfer the - * refcount to the head page. We have to be careful that the - * THP doesn't start to split while we are adjusting the - * refcounts. - * - * We are sure this doesn't happen, because mmu_notifier_retry - * was successful and we are holding the mmu_lock, so if this - * THP is trying to split, it will be blocked in the mmu - * notifier before touching any of the pages, specifically - * before being able to call __split_huge_page_refcount(). 
- * - * We can therefore safely transfer the refcount from PG_tail - * to PG_head and switch the pfn from a tail page to the head - * page accordingly. - */ - mask = PTRS_PER_PMD - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - *ipap &= PMD_MASK; - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } - - return true; - } - - return false; -} - -static bool kvm_is_write_fault(struct kvm_vcpu *vcpu) -{ - if (kvm_vcpu_trap_is_iabt(vcpu)) - return false; - - return kvm_vcpu_dabt_iswrite(vcpu); -} - -/** - * stage2_wp_ptes - write protect PMD range - * @pmd: pointer to pmd entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - if (!kvm_s2pte_readonly(pte)) - kvm_set_s2pte_readonly(pte); - } - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -/** - * stage2_wp_pmds - write protect PUD range - * kvm: kvm instance for the VM - * @pud: pointer to pud entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - if (!kvm_s2pmd_readonly(pmd)) - kvm_set_s2pmd_readonly(pmd); - } else { - stage2_wp_ptes(pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); -} - -/** - * stage2_wp_puds - write protect PGD range - * @pgd: pointer to pgd entry - * @addr: range start address - * @end: range end address - * - * Process PUD entries, for a huge PUD we cause a panic. 
- */ -static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, pgd, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - if (!kvm_s2pud_readonly(pud)) - kvm_set_s2pud_readonly(pud); - } else { - stage2_wp_pmds(kvm, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); -} - -/** - * stage2_wp_range() - write protect stage2 memory region range - * @kvm: The KVM pointer - * @addr: Start address of range - * @end: End address of range - */ -static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - pgd_t *pgd; - phys_addr_t next; - - pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Release kvm_mmu_lock periodically if the memory region is - * large. Otherwise, we may see kernel panics with - * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, - * CONFIG_LOCKDEP. Additionally, holding the lock too long - * will also starve other vCPUs. We have to also make sure - * that the page tables are not freed while we released - * the lock. - */ - cond_resched_lock(&kvm->mmu_lock); - if (!READ_ONCE(kvm->arch.pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (stage2_pgd_present(kvm, *pgd)) - stage2_wp_puds(kvm, pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} - -/** - * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot - * @kvm: The KVM pointer - * @slot: The memory slot to write protect - * - * Called to start logging dirty pages after memory region - * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns - * all present PUD, PMD and PTEs are write protected in the memory region. - * Afterwards read of dirty page log can be called. - * - * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, - * serializing operations for VM memory regions. 
- */ -void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) -{ - struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); - phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; - phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; - - spin_lock(&kvm->mmu_lock); - stage2_wp_range(kvm, start, end); - spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); -} - -/** - * kvm_mmu_write_protect_pt_masked() - write protect dirty pages - * @kvm: The KVM pointer - * @slot: The memory slot associated with mask - * @gfn_offset: The gfn offset in memory slot - * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory - * slot to be write protected - * - * Walks bits set in mask write protects the associated pte's. Caller must - * acquire kvm_mmu_lock. - */ -static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) -{ - phys_addr_t base_gfn = slot->base_gfn + gfn_offset; - phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; - phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; - - stage2_wp_range(kvm, start, end); -} - -/* - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * dirty pages. - * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. 
- */ -void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) -{ - kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); -} - -static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) -{ - __clean_dcache_guest_page(pfn, size); -} - -static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) -{ - __invalidate_icache_guest_page(pfn, size); -} - -static void kvm_send_hwpoison_signal(unsigned long address, - struct vm_area_struct *vma) -{ - short lsb; - - if (is_vm_hugetlb_page(vma)) - lsb = huge_page_shift(hstate_vma(vma)); - else - lsb = PAGE_SHIFT; - - send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); -} - -static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, - unsigned long hva) -{ - gpa_t gpa_start, gpa_end; - hva_t uaddr_start, uaddr_end; - size_t size; - - size = memslot->npages * PAGE_SIZE; - - gpa_start = memslot->base_gfn << PAGE_SHIFT; - gpa_end = gpa_start + size; - - uaddr_start = memslot->userspace_addr; - uaddr_end = uaddr_start + size; - - /* - * Pages belonging to memslots that don't have the same alignment - * within a PMD for userspace and IPA cannot be mapped with stage-2 - * PMD entries, because we'll end up mapping the wrong pages. 
- * - * Consider a layout like the following: - * - * memslot->userspace_addr: - * +-----+--------------------+--------------------+---+ - * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz| - * +-----+--------------------+--------------------+---+ - * - * memslot->base_gfn << PAGE_SIZE: - * +---+--------------------+--------------------+-----+ - * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz| - * +---+--------------------+--------------------+-----+ - * - * If we create those stage-2 PMDs, we'll end up with this incorrect - * mapping: - * d -> f - * e -> g - * f -> h - */ - if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK)) - return false; - - /* - * Next, let's make sure we're not trying to map anything not covered - * by the memslot. This means we have to prohibit PMD size mappings - * for the beginning and end of a non-PMD aligned and non-PMD sized - * memory slot (illustrated by the head and tail parts of the - * userspace view above containing pages 'abcde' and 'xyz', - * respectively). - * - * Note that it doesn't matter if we do the check using the - * userspace_addr or the base_gfn, as both are equally aligned (per - * the check above) and equally sized. 
- */ - return (hva & S2_PMD_MASK) >= uaddr_start && - (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end; -} - -static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - struct kvm_memory_slot *memslot, unsigned long hva, - unsigned long fault_status) -{ - int ret; - bool write_fault, writable, force_pte = false; - bool exec_fault, needs_exec; - unsigned long mmu_seq; - gfn_t gfn = fault_ipa >> PAGE_SHIFT; - struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; - struct vm_area_struct *vma; - kvm_pfn_t pfn; - pgprot_t mem_type = PAGE_S2; - bool logging_active = memslot_is_logging(memslot); - unsigned long vma_pagesize, flags = 0; - - write_fault = kvm_is_write_fault(vcpu); - exec_fault = kvm_vcpu_trap_is_iabt(vcpu); - VM_BUG_ON(write_fault && exec_fault); - - if (fault_status == FSC_PERM && !write_fault && !exec_fault) { - kvm_err("Unexpected L2 read permission error\n"); - return -EFAULT; - } - - if (!fault_supports_stage2_pmd_mappings(memslot, hva)) - force_pte = true; - - if (logging_active) - force_pte = true; - - /* Let's check if we will get back a huge page backed by hugetlbfs */ - down_read(¤t->mm->mmap_sem); - vma = find_vma_intersection(current->mm, hva, hva + 1); - if (unlikely(!vma)) { - kvm_err("Failed to find VMA for hva 0x%lx\n", hva); - up_read(¤t->mm->mmap_sem); - return -EFAULT; - } - - vma_pagesize = vma_kernel_pagesize(vma); - /* - * PUD level may not exist for a VM but PMD is guaranteed to - * exist. 
- */ - if ((vma_pagesize == PMD_SIZE || - (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) && - !force_pte) { - gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; - } - up_read(¤t->mm->mmap_sem); - - /* We need minimum second+third level pages */ - ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), - KVM_NR_MEM_OBJS); - if (ret) - return ret; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* - * Ensure the read of mmu_notifier_seq happens before we call - * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk - * the page we just got a reference to gets unmapped before we have a - * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_hva will take it away - * from us again properly. This smp_rmb() interacts with the smp_wmb() - * in kvm_mmu_notifier_invalidate_<page|range_end>. - */ - smp_rmb(); - - pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(hva, vma); - return 0; - } - if (is_error_noslot_pfn(pfn)) - return -EFAULT; - - if (kvm_is_device_pfn(pfn)) { - mem_type = PAGE_S2_DEVICE; - flags |= KVM_S2PTE_FLAG_IS_IOMAP; - } else if (logging_active) { - /* - * Faults on pages in a memslot with logging enabled - * should not be mapped with huge pages (it introduces churn - * and performance degradation), so force a pte mapping. - */ - flags |= KVM_S2_FLAG_LOGGING_ACTIVE; - - /* - * Only actually map the page as writable if this was a write - * fault. - */ - if (!write_fault) - writable = false; - } - - spin_lock(&kvm->mmu_lock); - if (mmu_notifier_retry(kvm, mmu_seq)) - goto out_unlock; - - if (vma_pagesize == PAGE_SIZE && !force_pte) { - /* - * Only PMD_SIZE transparent hugepages(THP) are - * currently supported. This code will need to be - * updated to support other THP sizes. 
- */ - if (transparent_hugepage_adjust(&pfn, &fault_ipa)) - vma_pagesize = PMD_SIZE; - } - - if (writable) - kvm_set_pfn_dirty(pfn); - - if (fault_status != FSC_PERM) - clean_dcache_guest_page(pfn, vma_pagesize); - - if (exec_fault) - invalidate_icache_guest_page(pfn, vma_pagesize); - - /* - * If we took an execution fault we have made the - * icache/dcache coherent above and should now let the s2 - * mapping be executable. - * - * Write faults (!exec_fault && FSC_PERM) are orthogonal to - * execute permissions, and we preserve whatever we have. - */ - needs_exec = exec_fault || - (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); - - if (vma_pagesize == PUD_SIZE) { - pud_t new_pud = kvm_pfn_pud(pfn, mem_type); - - new_pud = kvm_pud_mkhuge(new_pud); - if (writable) - new_pud = kvm_s2pud_mkwrite(new_pud); - - if (needs_exec) - new_pud = kvm_s2pud_mkexec(new_pud); - - ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); - } else if (vma_pagesize == PMD_SIZE) { - pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); - - new_pmd = kvm_pmd_mkhuge(new_pmd); - - if (writable) - new_pmd = kvm_s2pmd_mkwrite(new_pmd); - - if (needs_exec) - new_pmd = kvm_s2pmd_mkexec(new_pmd); - - ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); - } else { - pte_t new_pte = kvm_pfn_pte(pfn, mem_type); - - if (writable) { - new_pte = kvm_s2pte_mkwrite(new_pte); - mark_page_dirty(kvm, gfn); - } - - if (needs_exec) - new_pte = kvm_s2pte_mkexec(new_pte); - - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); - } - -out_unlock: - spin_unlock(&kvm->mmu_lock); - kvm_set_pfn_accessed(pfn); - kvm_release_pfn_clean(pfn); - return ret; -} - -/* - * Resolve the access fault by making the page young again. - * Note that because the faulting entry is guaranteed not to be - * cached in the TLB, we don't need to invalidate anything. - * Only the HW Access Flag updates are supported for Stage 2 (no DBM), - * so there is no need for atomic (pte|pmd)_mkyoung operations. 
- */ -static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - kvm_pfn_t pfn; - bool pfn_valid = false; - - trace_kvm_access_fault(fault_ipa); - - spin_lock(&vcpu->kvm->mmu_lock); - - if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) - goto out; - - if (pud) { /* HugeTLB */ - *pud = kvm_s2pud_mkyoung(*pud); - pfn = kvm_pud_pfn(*pud); - pfn_valid = true; - } else if (pmd) { /* THP, HugeTLB */ - *pmd = pmd_mkyoung(*pmd); - pfn = pmd_pfn(*pmd); - pfn_valid = true; - } else { - *pte = pte_mkyoung(*pte); /* Just a page... */ - pfn = pte_pfn(*pte); - pfn_valid = true; - } - -out: - spin_unlock(&vcpu->kvm->mmu_lock); - if (pfn_valid) - kvm_set_pfn_accessed(pfn); -} - -/** - * kvm_handle_guest_abort - handles all 2nd stage aborts - * @vcpu: the VCPU pointer - * @run: the kvm_run structure - * - * Any abort that gets to the host is almost guaranteed to be caused by a - * missing second stage translation table entry, which can mean that either the - * guest simply needs more memory and we must allocate an appropriate page or it - * can mean that the guest tried to access I/O memory, which is emulated by user - * space. The distinction is based on the IPA causing the fault and whether this - * memory region has been registered as standard RAM by user space. - */ -int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - unsigned long fault_status; - phys_addr_t fault_ipa; - struct kvm_memory_slot *memslot; - unsigned long hva; - bool is_iabt, write_fault, writable; - gfn_t gfn; - int ret, idx; - - fault_status = kvm_vcpu_trap_get_fault_type(vcpu); - - fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); - is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - - /* Synchronous External Abort? */ - if (kvm_vcpu_dabt_isextabt(vcpu)) { - /* - * For RAS the host kernel may handle this abort. - * There is no need to pass the error into the guest. 
- */ - if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) - return 1; - - if (unlikely(!is_iabt)) { - kvm_inject_vabt(vcpu); - return 1; - } - } - - trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), - kvm_vcpu_get_hfar(vcpu), fault_ipa); - - /* Check the stage-2 fault is trans. fault or write fault */ - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && - fault_status != FSC_ACCESS) { - kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", - kvm_vcpu_trap_get_class(vcpu), - (unsigned long)kvm_vcpu_trap_get_fault(vcpu), - (unsigned long)kvm_vcpu_get_hsr(vcpu)); - return -EFAULT; - } - - idx = srcu_read_lock(&vcpu->kvm->srcu); - - gfn = fault_ipa >> PAGE_SHIFT; - memslot = gfn_to_memslot(vcpu->kvm, gfn); - hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); - write_fault = kvm_is_write_fault(vcpu); - if (kvm_is_error_hva(hva) || (write_fault && !writable)) { - if (is_iabt) { - /* Prefetch Abort on I/O address */ - kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - ret = 1; - goto out_unlock; - } - - /* - * Check for a cache maintenance operation. Since we - * ended-up here, we know it is outside of any memory - * slot. But we can't find out if that is for a device, - * or if the guest is just being stupid. The only thing - * we know for sure is that this range cannot be cached. - * - * So let's assume that the guest is just being - * cautious, and skip the instruction. - */ - if (kvm_vcpu_dabt_is_cm(vcpu)) { - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - ret = 1; - goto out_unlock; - } - - /* - * The IPA is reported as [MAX:12], so we need to - * complement it with the bottom 12 bits from the - * faulting VA. This is always 12 bits, irrespective - * of the page size. 
- */ - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, run, fault_ipa); - goto out_unlock; - } - - /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); - - if (fault_status == FSC_ACCESS) { - handle_access_fault(vcpu, fault_ipa); - ret = 1; - goto out_unlock; - } - - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); - if (ret == 0) - ret = 1; -out_unlock: - srcu_read_unlock(&vcpu->kvm->srcu, idx); - return ret; -} - -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, - gpa_t gpa, u64 size, - void *data), - void *data) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gpa; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; - ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); - } - - return ret; -} - -static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - unmap_stage2_range(kvm, gpa, size); - return 0; -} - -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end) -{ - if (!kvm->arch.pgd) - return 0; - - trace_kvm_unmap_hva_range(start, end); - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); - return 0; -} - -static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - pte_t *pte = (pte_t *)data; - - WARN_ON(size != PAGE_SIZE); - /* - * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE - * flag clear because MMU notifiers will have unmapped a huge PMD before - * 
calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and - * therefore stage2_set_pte() never needs to clear out a huge PMD - * through this calling path. - */ - stage2_set_pte(kvm, NULL, gpa, pte, 0); - return 0; -} - - -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) -{ - unsigned long end = hva + PAGE_SIZE; - kvm_pfn_t pfn = pte_pfn(pte); - pte_t stage2_pte; - - if (!kvm->arch.pgd) - return 0; - - trace_kvm_set_spte_hva(hva); - - /* - * We've moved a page around, probably through CoW, so let's treat it - * just like a translation fault and clean the cache to the PoC. - */ - clean_dcache_guest_page(pfn, PAGE_SIZE); - stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); - handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); - - return 0; -} - -static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return stage2_pudp_test_and_clear_young(pud); - else if (pmd) - return stage2_pmdp_test_and_clear_young(pmd); - else - return stage2_ptep_test_and_clear_young(pte); -} - -static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return kvm_s2pud_young(*pud); - else if (pmd) - return pmd_young(*pmd); - else - return pte_young(*pte); -} - -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) -{ - if (!kvm->arch.pgd) - return 0; - trace_kvm_age_hva(start, end); - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); -} - -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - if (!kvm->arch.pgd) - return 0; - trace_kvm_test_age_hva(hva); - return 
handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); -} - -void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); -} - -phys_addr_t kvm_mmu_get_httbr(void) -{ - if (__kvm_cpu_uses_extended_idmap()) - return virt_to_phys(merged_hyp_pgd); - else - return virt_to_phys(hyp_pgd); -} - -phys_addr_t kvm_get_idmap_vector(void) -{ - return hyp_idmap_vector; -} - -static int kvm_map_idmap_text(pgd_t *pgd) -{ - int err; - - /* Create the idmap in the boot page tables */ - err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - hyp_idmap_start, hyp_idmap_end, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP_EXEC); - if (err) - kvm_err("Failed to idmap %lx-%lx\n", - hyp_idmap_start, hyp_idmap_end); - - return err; -} - -int kvm_mmu_init(void) -{ - int err; - - hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start); - hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); - hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); - hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); - hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); - - /* - * We rely on the linker script to ensure at build time that the HYP - * init code does not cross a page boundary. - */ - BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); - - kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); - kvm_debug("HYP VA range: %lx:%lx\n", - kern_hyp_va(PAGE_OFFSET), - kern_hyp_va((unsigned long)high_memory - 1)); - - if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && - hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && - hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { - /* - * The idmap page is intersecting with the VA space, - * it is not safe to continue further. 
- */ - kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); - err = -EINVAL; - goto out; - } - - hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); - if (!hyp_pgd) { - kvm_err("Hyp mode PGD not allocated\n"); - err = -ENOMEM; - goto out; - } - - if (__kvm_cpu_uses_extended_idmap()) { - boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - hyp_pgd_order); - if (!boot_hyp_pgd) { - kvm_err("Hyp boot PGD not allocated\n"); - err = -ENOMEM; - goto out; - } - - err = kvm_map_idmap_text(boot_hyp_pgd); - if (err) - goto out; - - merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - if (!merged_hyp_pgd) { - kvm_err("Failed to allocate extra HYP pgd\n"); - goto out; - } - __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, - hyp_idmap_start); - } else { - err = kvm_map_idmap_text(hyp_pgd); - if (err) - goto out; - } - - io_map_base = hyp_idmap_start; - return 0; -out: - free_hyp_pgds(); - return err; -} - -void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, - const struct kvm_memory_slot *new, - enum kvm_mr_change change) -{ - /* - * At this point memslot has been committed and there is an - * allocated dirty_bitmap[], dirty pages will be be tracked while the - * memory slot is write protected. 
- */ - if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) - kvm_mmu_wp_memory_region(kvm, mem->slot); -} - -int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem, - enum kvm_mr_change change) -{ - hva_t hva = mem->userspace_addr; - hva_t reg_end = hva + mem->memory_size; - bool writable = !(mem->flags & KVM_MEM_READONLY); - int ret = 0; - - if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && - change != KVM_MR_FLAGS_ONLY) - return 0; - - /* - * Prevent userspace from creating a memory region outside of the IPA - * space addressable by the KVM guest IPA space. - */ - if (memslot->base_gfn + memslot->npages >= - (kvm_phys_size(kvm) >> PAGE_SHIFT)) - return -EFAULT; - - down_read(¤t->mm->mmap_sem); - /* - * A memory region could potentially cover multiple VMAs, and any holes - * between them, so iterate over all of them to find out if we can map - * any of them right now. - * - * +--------------------------------------------+ - * +---------------+----------------+ +----------------+ - * | : VMA 1 | VMA 2 | | VMA 3 : | - * +---------------+----------------+ +----------------+ - * | memory region | - * +--------------------------------------------+ - */ - do { - struct vm_area_struct *vma = find_vma(current->mm, hva); - hva_t vm_start, vm_end; - - if (!vma || vma->vm_start >= reg_end) - break; - - /* - * Mapping a read-only VMA is only allowed if the - * memory region is configured as read-only. 
- */ - if (writable && !(vma->vm_flags & VM_WRITE)) { - ret = -EPERM; - break; - } - - /* - * Take the intersection of this VMA with the memory region - */ - vm_start = max(hva, vma->vm_start); - vm_end = min(reg_end, vma->vm_end); - - if (vma->vm_flags & VM_PFNMAP) { - gpa_t gpa = mem->guest_phys_addr + - (vm_start - mem->userspace_addr); - phys_addr_t pa; - - pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; - pa += vm_start - vma->vm_start; - - /* IO region dirty page logging not allowed */ - if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) { - ret = -EINVAL; - goto out; - } - - ret = kvm_phys_addr_ioremap(kvm, gpa, pa, - vm_end - vm_start, - writable); - if (ret) - break; - } - hva = vm_end; - } while (hva < reg_end); - - if (change == KVM_MR_FLAGS_ONLY) - goto out; - - spin_lock(&kvm->mmu_lock); - if (ret) - unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); - else - stage2_flush_memslot(kvm, memslot); - spin_unlock(&kvm->mmu_lock); -out: - up_read(¤t->mm->mmap_sem); - return ret; -} - -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ -} - -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) -{ - return 0; -} - -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) -{ -} - -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_free_stage2_pgd(kvm); -} - -void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - gpa_t gpa = slot->base_gfn << PAGE_SHIFT; - phys_addr_t size = slot->npages << PAGE_SHIFT; - - spin_lock(&kvm->mmu_lock); - unmap_stage2_range(kvm, gpa, size); - spin_unlock(&kvm->mmu_lock); -} - -/* - * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). - * - * Main problems: - * - S/W ops are local to a CPU (not broadcast) - * - We have line migration behind our back (speculation) - * - System caches don't support S/W at all (damn!) 
- * - * In the face of the above, the best we can do is to try and convert - * S/W ops to VA ops. Because the guest is not allowed to infer the - * S/W to PA mapping, it can only use S/W to nuke the whole cache, - * which is a rather good thing for us. - * - * Also, it is only used when turning caches on/off ("The expected - * usage of the cache maintenance instructions that operate by set/way - * is associated with the cache maintenance instructions associated - * with the powerdown and powerup of caches, if this is required by - * the implementation."). - * - * We use the following policy: - * - * - If we trap a S/W operation, we enable VM trapping to detect - * caches being turned on/off, and do a full clean. - * - * - We flush the caches on both caches being turned on and off. - * - * - Once the caches are enabled, we stop trapping VM ops. - */ -void kvm_set_way_flush(struct kvm_vcpu *vcpu) -{ - unsigned long hcr = *vcpu_hcr(vcpu); - - /* - * If this is the first time we do a S/W operation - * (i.e. HCR_TVM not set) flush the whole memory, and set the - * VM trapping. - * - * Otherwise, rely on the VM trapping to wait for the MMU + - * Caches to be turned off. At that point, we'll be able to - * clean the caches again. - */ - if (!(hcr & HCR_TVM)) { - trace_kvm_set_way_flush(*vcpu_pc(vcpu), - vcpu_has_cache_enabled(vcpu)); - stage2_flush_vm(vcpu->kvm); - *vcpu_hcr(vcpu) = hcr | HCR_TVM; - } -} - -void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) -{ - bool now_enabled = vcpu_has_cache_enabled(vcpu); - - /* - * If switching the MMU+caches on, need to invalidate the caches. - * If switching it off, need to clean the caches. - * Clean + invalidate does the trick always. 
- */ - if (now_enabled != was_enabled) - stage2_flush_vm(vcpu->kvm); - - /* Caches are now on, stop trapping VM ops (until a S/W op) */ - if (now_enabled) - *vcpu_hcr(vcpu) &= ~HCR_TVM; - - trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); -} diff --git a/virt/kvm/arm/perf.c b/virt/kvm/arm/perf.c deleted file mode 100644 index 1a3849da0b4b..000000000000 --- a/virt/kvm/arm/perf.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Based on the x86 implementation. - * - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/perf_event.h> -#include <linux/kvm_host.h> - -#include <asm/kvm_emulate.h> - -static int kvm_is_in_guest(void) -{ - return kvm_arm_get_running_vcpu() != NULL; -} - -static int kvm_is_user_mode(void) -{ - struct kvm_vcpu *vcpu; - - vcpu = kvm_arm_get_running_vcpu(); - - if (vcpu) - return !vcpu_mode_priv(vcpu); - - return 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - struct kvm_vcpu *vcpu; - - vcpu = kvm_arm_get_running_vcpu(); - - if (vcpu) - return *vcpu_pc(vcpu); - - return 0; -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, -}; - -int kvm_perf_init(void) -{ - return perf_register_guest_info_callbacks(&kvm_guest_cbs); -} - -int kvm_perf_teardown(void) -{ - return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); -} diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c deleted file mode 100644 index 1c5b76c46e26..000000000000 --- a/virt/kvm/arm/pmu.c +++ /dev/null @@ -1,606 +0,0 @@ -/* - * Copyright (C) 2015 Linaro Ltd. - * Author: Shannon Zhao <shannon.zhao@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/cpu.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/perf_event.h> -#include <linux/uaccess.h> -#include <asm/kvm_emulate.h> -#include <kvm/arm_pmu.h> -#include <kvm/arm_vgic.h> - -/** - * kvm_pmu_get_counter_value - get PMU counter value - * @vcpu: The vcpu pointer - * @select_idx: The counter index - */ -u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) -{ - u64 counter, reg, enabled, running; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; - - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; - counter = __vcpu_sys_reg(vcpu, reg); - - /* The real counter value is equal to the value of counter register plus - * the value perf event counts. - */ - if (pmc->perf_event) - counter += perf_event_read_value(pmc->perf_event, &enabled, - &running); - - return counter & pmc->bitmask; -} - -/** - * kvm_pmu_set_counter_value - set PMU counter value - * @vcpu: The vcpu pointer - * @select_idx: The counter index - * @val: The counter value - */ -void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) -{ - u64 reg; - - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; - __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); -} - -/** - * kvm_pmu_stop_counter - stop PMU counter - * @pmc: The PMU counter pointer - * - * If this counter has been configured to monitor some event, release it here. - */ -static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) -{ - u64 counter, reg; - - if (pmc->perf_event) { - counter = kvm_pmu_get_counter_value(vcpu, pmc->idx); - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? 
PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; - __vcpu_sys_reg(vcpu, reg) = counter; - perf_event_disable(pmc->perf_event); - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - } -} - -/** - * kvm_pmu_vcpu_reset - reset pmu state for cpu - * @vcpu: The vcpu pointer - * - */ -void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); - pmu->pmc[i].idx = i; - pmu->pmc[i].bitmask = 0xffffffffUL; - } -} - -/** - * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu - * @vcpu: The vcpu pointer - * - */ -void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc = &pmu->pmc[i]; - - if (pmc->perf_event) { - perf_event_disable(pmc->perf_event); - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - } - } -} - -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) -{ - u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT; - - val &= ARMV8_PMU_PMCR_N_MASK; - if (val == 0) - return BIT(ARMV8_PMU_CYCLE_IDX); - else - return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); -} - -/** - * kvm_pmu_enable_counter - enable selected PMU counter - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENSET register - * - * Call perf_event_enable to start counting the perf event - */ -void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) - return; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - if (!(val & BIT(i))) - continue; - - pmc = &pmu->pmc[i]; - if (pmc->perf_event) { - perf_event_enable(pmc->perf_event); - if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) - kvm_debug("fail to enable perf event\n"); - } - } -} - 
-/** - * kvm_pmu_disable_counter - disable selected PMU counter - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENCLR register - * - * Call perf_event_disable to stop counting the perf event - */ -void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u64 val) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - - if (!val) - return; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - if (!(val & BIT(i))) - continue; - - pmc = &pmu->pmc[i]; - if (pmc->perf_event) - perf_event_disable(pmc->perf_event); - } -} - -static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) -{ - u64 reg = 0; - - if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) { - reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); - reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); - reg &= kvm_pmu_valid_counter_mask(vcpu); - } - - return reg; -} - -static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool overflow; - - if (!kvm_arm_pmu_v3_ready(vcpu)) - return; - - overflow = !!kvm_pmu_overflow_status(vcpu); - if (pmu->irq_level == overflow) - return; - - pmu->irq_level = overflow; - - if (likely(irqchip_in_kernel(vcpu->kvm))) { - int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - pmu->irq_num, overflow, pmu); - WARN_ON(ret); - } -} - -bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_sync_regs *sregs = &vcpu->run->s.regs; - bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; - - if (likely(irqchip_in_kernel(vcpu->kvm))) - return false; - - return pmu->irq_level != run_level; -} - -/* - * Reflect the PMU overflow interrupt output level into the kvm_run structure - */ -void kvm_pmu_update_run(struct kvm_vcpu *vcpu) -{ - struct kvm_sync_regs *regs = &vcpu->run->s.regs; - - /* Populate the timer bitmap for user space */ - regs->device_irq_level &= ~KVM_ARM_DEV_PMU; - if (vcpu->arch.pmu.irq_level) - 
regs->device_irq_level |= KVM_ARM_DEV_PMU; -} - -/** - * kvm_pmu_flush_hwstate - flush pmu state to cpu - * @vcpu: The vcpu pointer - * - * Check if the PMU has overflowed while we were running in the host, and inject - * an interrupt if that was the case. - */ -void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) -{ - kvm_pmu_update_state(vcpu); -} - -/** - * kvm_pmu_sync_hwstate - sync pmu state from cpu - * @vcpu: The vcpu pointer - * - * Check if the PMU has overflowed while we were running in the guest, and - * inject an interrupt if that was the case. - */ -void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) -{ - kvm_pmu_update_state(vcpu); -} - -static inline struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu; - struct kvm_vcpu_arch *vcpu_arch; - - pmc -= pmc->idx; - pmu = container_of(pmc, struct kvm_pmu, pmc[0]); - vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu); - return container_of(vcpu_arch, struct kvm_vcpu, arch); -} - -/** - * When the perf event overflows, set the overflow status and inform the vcpu. 
- */ -static void kvm_pmu_perf_overflow(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - int idx = pmc->idx; - - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); - - if (kvm_pmu_overflow_status(vcpu)) { - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - } -} - -/** - * kvm_pmu_software_increment - do software increment - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMSWINC register - */ -void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) -{ - int i; - u64 type, enable, reg; - - if (val == 0) - return; - - enable = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { - if (!(val & BIT(i))) - continue; - type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i) - & ARMV8_PMU_EVTYPE_EVENT; - if ((type == ARMV8_PMUV3_PERFCTR_SW_INCR) - && (enable & BIT(i))) { - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; - reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; - if (!reg) - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); - } - } -} - -/** - * kvm_pmu_handle_pmcr - handle PMCR register - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCR register - */ -void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - u64 mask; - int i; - - mask = kvm_pmu_valid_counter_mask(vcpu); - if (val & ARMV8_PMU_PMCR_E) { - kvm_pmu_enable_counter(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); - } else { - kvm_pmu_disable_counter(vcpu, mask); - } - - if (val & ARMV8_PMU_PMCR_C) - kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); - - if (val & ARMV8_PMU_PMCR_P) { - for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) - kvm_pmu_set_counter_value(vcpu, i, 0); - } - - if (val & ARMV8_PMU_PMCR_LC) { - pmc = &pmu->pmc[ARMV8_PMU_CYCLE_IDX]; - pmc->bitmask = 
0xffffffffffffffffUL; - } -} - -static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx) -{ - return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && - (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx)); -} - -/** - * kvm_pmu_set_counter_event_type - set selected counter to monitor some event - * @vcpu: The vcpu pointer - * @data: The data guest writes to PMXEVTYPER_EL0 - * @select_idx: The number of selected counter - * - * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an - * event with given hardware event number. Here we call perf_event API to - * emulate this action and create a kernel perf event for it. - */ -void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, - u64 select_idx) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; - struct perf_event *event; - struct perf_event_attr attr; - u64 eventsel, counter; - - kvm_pmu_stop_counter(vcpu, pmc); - eventsel = data & ARMV8_PMU_EVTYPE_EVENT; - - /* Software increment event does't need to be backed by a perf event */ - if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && - select_idx != ARMV8_PMU_CYCLE_IDX) - return; - - memset(&attr, 0, sizeof(struct perf_event_attr)); - attr.type = PERF_TYPE_RAW; - attr.size = sizeof(attr); - attr.pinned = 1; - attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, select_idx); - attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0; - attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; - attr.exclude_hv = 1; /* Don't count EL2 events */ - attr.exclude_host = 1; /* Don't count host events */ - attr.config = (select_idx == ARMV8_PMU_CYCLE_IDX) ? - ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel; - - counter = kvm_pmu_get_counter_value(vcpu, select_idx); - /* The initial sample period (overflow count) of an event. 
*/ - attr.sample_period = (-counter) & pmc->bitmask; - - event = perf_event_create_kernel_counter(&attr, -1, current, - kvm_pmu_perf_overflow, pmc); - if (IS_ERR(event)) { - pr_err_once("kvm: pmu event creation failed %ld\n", - PTR_ERR(event)); - return; - } - - pmc->perf_event = event; -} - -bool kvm_arm_support_pmu_v3(void) -{ - /* - * Check if HW_PERF_EVENTS are supported by checking the number of - * hardware performance counters. This could ensure the presence of - * a physical PMU and CONFIG_PERF_EVENT is selected. - */ - return (perf_num_counters() > 0); -} - -int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.pmu.created) - return 0; - - /* - * A valid interrupt configuration for the PMU is either to have a - * properly configured interrupt number and using an in-kernel - * irqchip, or to not have an in-kernel GIC and not set an IRQ. - */ - if (irqchip_in_kernel(vcpu->kvm)) { - int irq = vcpu->arch.pmu.irq_num; - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -EINVAL; - - /* - * If we are using an in-kernel vgic, at this point we know - * the vgic will be initialized, so we can check the PMU irq - * number against the dimensions of the vgic and make sure - * it's valid. - */ - if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) - return -EINVAL; - } else if (kvm_arm_pmu_irq_initialized(vcpu)) { - return -EINVAL; - } - - kvm_pmu_vcpu_reset(vcpu); - vcpu->arch.pmu.ready = true; - - return 0; -} - -static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) -{ - if (!kvm_arm_support_pmu_v3()) - return -ENODEV; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENXIO; - - if (vcpu->arch.pmu.created) - return -EBUSY; - - if (irqchip_in_kernel(vcpu->kvm)) { - int ret; - - /* - * If using the PMU with an in-kernel virtual GIC - * implementation, we require the GIC to be already - * initialized when initializing the PMU. 
- */ - if (!vgic_initialized(vcpu->kvm)) - return -ENODEV; - - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -ENXIO; - - ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, - &vcpu->arch.pmu); - if (ret) - return ret; - } - - vcpu->arch.pmu.created = true; - return 0; -} - -/* - * For one VM the interrupt type must be same for each vcpu. - * As a PPI, the interrupt number is the same for all vcpus, - * while as an SPI it must be a separate number per vcpu. - */ -static bool pmu_irq_is_valid(struct kvm *kvm, int irq) -{ - int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_arm_pmu_irq_initialized(vcpu)) - continue; - - if (irq_is_ppi(irq)) { - if (vcpu->arch.pmu.irq_num != irq) - return false; - } else { - if (vcpu->arch.pmu.irq_num == irq) - return false; - } - } - - return true; -} - -int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: { - int __user *uaddr = (int __user *)(long)attr->addr; - int irq; - - if (!irqchip_in_kernel(vcpu->kvm)) - return -EINVAL; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENODEV; - - if (get_user(irq, uaddr)) - return -EFAULT; - - /* The PMU overflow interrupt can be a PPI or a valid SPI. 
*/ - if (!(irq_is_ppi(irq) || irq_is_spi(irq))) - return -EINVAL; - - if (!pmu_irq_is_valid(vcpu->kvm, irq)) - return -EINVAL; - - if (kvm_arm_pmu_irq_initialized(vcpu)) - return -EBUSY; - - kvm_debug("Set kvm ARM PMU irq: %d\n", irq); - vcpu->arch.pmu.irq_num = irq; - return 0; - } - case KVM_ARM_VCPU_PMU_V3_INIT: - return kvm_arm_pmu_v3_init(vcpu); - } - - return -ENXIO; -} - -int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: { - int __user *uaddr = (int __user *)(long)attr->addr; - int irq; - - if (!irqchip_in_kernel(vcpu->kvm)) - return -EINVAL; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENODEV; - - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -ENXIO; - - irq = vcpu->arch.pmu.irq_num; - return put_user(irq, uaddr); - } - } - - return -ENXIO; -} - -int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: - case KVM_ARM_VCPU_PMU_V3_INIT: - if (kvm_arm_support_pmu_v3() && - test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return 0; - } - - return -ENXIO; -} diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c deleted file mode 100644 index 9b73d3ad918a..000000000000 --- a/virt/kvm/arm/psci.c +++ /dev/null @@ -1,503 +0,0 @@ -/* - * Copyright (C) 2012 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. 
If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/arm-smccc.h> -#include <linux/preempt.h> -#include <linux/kvm_host.h> -#include <linux/uaccess.h> -#include <linux/wait.h> - -#include <asm/cputype.h> -#include <asm/kvm_emulate.h> -#include <asm/kvm_host.h> - -#include <kvm/arm_psci.h> - -/* - * This is an implementation of the Power State Coordination Interface - * as described in ARM document number ARM DEN 0022A. - */ - -#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) - -static u32 smccc_get_function(struct kvm_vcpu *vcpu) -{ - return vcpu_get_reg(vcpu, 0); -} - -static unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu) -{ - return vcpu_get_reg(vcpu, 1); -} - -static unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu) -{ - return vcpu_get_reg(vcpu, 2); -} - -static unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu) -{ - return vcpu_get_reg(vcpu, 3); -} - -static void smccc_set_retval(struct kvm_vcpu *vcpu, - unsigned long a0, - unsigned long a1, - unsigned long a2, - unsigned long a3) -{ - vcpu_set_reg(vcpu, 0, a0); - vcpu_set_reg(vcpu, 1, a1); - vcpu_set_reg(vcpu, 2, a2); - vcpu_set_reg(vcpu, 3, a3); -} - -static unsigned long psci_affinity_mask(unsigned long affinity_level) -{ - if (affinity_level <= 3) - return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); - - return 0; -} - -static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) -{ - /* - * NOTE: For simplicity, we make VCPU suspend emulation to be - * same-as WFI (Wait-for-interrupt) emulation. - * - * This means for KVM the wakeup events are interrupts and - * this is consistent with intended use of StateID as described - * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A). - * - * Further, we also treat power-down request to be same as - * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2 - * specification (ARM DEN 0022A). This means all suspend states - * for KVM will preserve the register state. 
- */ - kvm_vcpu_block(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); - - return PSCI_RET_SUCCESS; -} - -static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) -{ - vcpu->arch.power_off = true; - kvm_make_request(KVM_REQ_SLEEP, vcpu); - kvm_vcpu_kick(vcpu); -} - -static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) -{ - struct kvm *kvm = source_vcpu->kvm; - struct kvm_vcpu *vcpu = NULL; - struct swait_queue_head *wq; - unsigned long cpu_id; - unsigned long context_id; - phys_addr_t target_pc; - - cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; - if (vcpu_mode_is_32bit(source_vcpu)) - cpu_id &= ~((u32) 0); - - vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); - - /* - * Make sure the caller requested a valid CPU and that the CPU is - * turned off. - */ - if (!vcpu) - return PSCI_RET_INVALID_PARAMS; - if (!vcpu->arch.power_off) { - if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1) - return PSCI_RET_ALREADY_ON; - else - return PSCI_RET_INVALID_PARAMS; - } - - target_pc = smccc_get_arg2(source_vcpu); - context_id = smccc_get_arg3(source_vcpu); - - kvm_reset_vcpu(vcpu); - - /* Gracefully handle Thumb2 entry point */ - if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { - target_pc &= ~((phys_addr_t) 1); - vcpu_set_thumb(vcpu); - } - - /* Propagate caller endianness */ - if (kvm_vcpu_is_be(source_vcpu)) - kvm_vcpu_set_be(vcpu); - - *vcpu_pc(vcpu) = target_pc; - /* - * NOTE: We always update r0 (or x0) because for PSCI v0.1 - * the general puspose registers are undefined upon CPU_ON. 
- */ - smccc_set_retval(vcpu, context_id, 0, 0, 0); - vcpu->arch.power_off = false; - smp_mb(); /* Make sure the above is visible */ - - wq = kvm_arch_vcpu_wq(vcpu); - swake_up_one(wq); - - return PSCI_RET_SUCCESS; -} - -static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) -{ - int i, matching_cpus = 0; - unsigned long mpidr; - unsigned long target_affinity; - unsigned long target_affinity_mask; - unsigned long lowest_affinity_level; - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *tmp; - - target_affinity = smccc_get_arg1(vcpu); - lowest_affinity_level = smccc_get_arg2(vcpu); - - /* Determine target affinity mask */ - target_affinity_mask = psci_affinity_mask(lowest_affinity_level); - if (!target_affinity_mask) - return PSCI_RET_INVALID_PARAMS; - - /* Ignore other bits of target affinity */ - target_affinity &= target_affinity_mask; - - /* - * If one or more VCPU matching target affinity are running - * then ON else OFF - */ - kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr_aff(tmp); - if ((mpidr & target_affinity_mask) == target_affinity) { - matching_cpus++; - if (!tmp->arch.power_off) - return PSCI_0_2_AFFINITY_LEVEL_ON; - } - } - - if (!matching_cpus) - return PSCI_RET_INVALID_PARAMS; - - return PSCI_0_2_AFFINITY_LEVEL_OFF; -} - -static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type) -{ - int i; - struct kvm_vcpu *tmp; - - /* - * The KVM ABI specifies that a system event exit may call KVM_RUN - * again and may perform shutdown/reboot at a later time that when the - * actual request is made. Since we are implementing PSCI and a - * caller of PSCI reboot and shutdown expects that the system shuts - * down or reboots immediately, let's make sure that VCPUs are not run - * after this call is handled and before the VCPUs have been - * re-initialized. 
- */ - kvm_for_each_vcpu(i, tmp, vcpu->kvm) - tmp->arch.power_off = true; - kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); - - memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); - vcpu->run->system_event.type = type; - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; -} - -static void kvm_psci_system_off(struct kvm_vcpu *vcpu) -{ - kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN); -} - -static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) -{ - kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET); -} - -static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - u32 psci_fn = smccc_get_function(vcpu); - unsigned long val; - int ret = 1; - - switch (psci_fn) { - case PSCI_0_2_FN_PSCI_VERSION: - /* - * Bits[31:16] = Major Version = 0 - * Bits[15:0] = Minor Version = 2 - */ - val = KVM_ARM_PSCI_0_2; - break; - case PSCI_0_2_FN_CPU_SUSPEND: - case PSCI_0_2_FN64_CPU_SUSPEND: - val = kvm_psci_vcpu_suspend(vcpu); - break; - case PSCI_0_2_FN_CPU_OFF: - kvm_psci_vcpu_off(vcpu); - val = PSCI_RET_SUCCESS; - break; - case PSCI_0_2_FN_CPU_ON: - case PSCI_0_2_FN64_CPU_ON: - mutex_lock(&kvm->lock); - val = kvm_psci_vcpu_on(vcpu); - mutex_unlock(&kvm->lock); - break; - case PSCI_0_2_FN_AFFINITY_INFO: - case PSCI_0_2_FN64_AFFINITY_INFO: - val = kvm_psci_vcpu_affinity_info(vcpu); - break; - case PSCI_0_2_FN_MIGRATE_INFO_TYPE: - /* - * Trusted OS is MP hence does not require migration - * or - * Trusted OS is not present - */ - val = PSCI_0_2_TOS_MP; - break; - case PSCI_0_2_FN_SYSTEM_OFF: - kvm_psci_system_off(vcpu); - /* - * We should'nt be going back to guest VCPU after - * receiving SYSTEM_OFF request. - * - * If user space accidently/deliberately resumes - * guest VCPU after SYSTEM_OFF request then guest - * VCPU should see internal failure from PSCI return - * value. To achieve this, we preload r0 (or x0) with - * PSCI return value INTERNAL_FAILURE. 
- */ - val = PSCI_RET_INTERNAL_FAILURE; - ret = 0; - break; - case PSCI_0_2_FN_SYSTEM_RESET: - kvm_psci_system_reset(vcpu); - /* - * Same reason as SYSTEM_OFF for preloading r0 (or x0) - * with PSCI return value INTERNAL_FAILURE. - */ - val = PSCI_RET_INTERNAL_FAILURE; - ret = 0; - break; - default: - val = PSCI_RET_NOT_SUPPORTED; - break; - } - - smccc_set_retval(vcpu, val, 0, 0, 0); - return ret; -} - -static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu) -{ - u32 psci_fn = smccc_get_function(vcpu); - u32 feature; - unsigned long val; - int ret = 1; - - switch(psci_fn) { - case PSCI_0_2_FN_PSCI_VERSION: - val = KVM_ARM_PSCI_1_0; - break; - case PSCI_1_0_FN_PSCI_FEATURES: - feature = smccc_get_arg1(vcpu); - switch(feature) { - case PSCI_0_2_FN_PSCI_VERSION: - case PSCI_0_2_FN_CPU_SUSPEND: - case PSCI_0_2_FN64_CPU_SUSPEND: - case PSCI_0_2_FN_CPU_OFF: - case PSCI_0_2_FN_CPU_ON: - case PSCI_0_2_FN64_CPU_ON: - case PSCI_0_2_FN_AFFINITY_INFO: - case PSCI_0_2_FN64_AFFINITY_INFO: - case PSCI_0_2_FN_MIGRATE_INFO_TYPE: - case PSCI_0_2_FN_SYSTEM_OFF: - case PSCI_0_2_FN_SYSTEM_RESET: - case PSCI_1_0_FN_PSCI_FEATURES: - case ARM_SMCCC_VERSION_FUNC_ID: - val = 0; - break; - default: - val = PSCI_RET_NOT_SUPPORTED; - break; - } - break; - default: - return kvm_psci_0_2_call(vcpu); - } - - smccc_set_retval(vcpu, val, 0, 0, 0); - return ret; -} - -static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - u32 psci_fn = smccc_get_function(vcpu); - unsigned long val; - - switch (psci_fn) { - case KVM_PSCI_FN_CPU_OFF: - kvm_psci_vcpu_off(vcpu); - val = PSCI_RET_SUCCESS; - break; - case KVM_PSCI_FN_CPU_ON: - mutex_lock(&kvm->lock); - val = kvm_psci_vcpu_on(vcpu); - mutex_unlock(&kvm->lock); - break; - default: - val = PSCI_RET_NOT_SUPPORTED; - break; - } - - smccc_set_retval(vcpu, val, 0, 0, 0); - return 1; -} - -/** - * kvm_psci_call - handle PSCI call if r0 value is in range - * @vcpu: Pointer to the VCPU struct - * - * Handle PSCI calls from guests 
through traps from HVC instructions. - * The calling convention is similar to SMC calls to the secure world - * where the function number is placed in r0. - * - * This function returns: > 0 (success), 0 (success but exit to user - * space), and < 0 (errors) - * - * Errors: - * -EINVAL: Unrecognized PSCI function - */ -static int kvm_psci_call(struct kvm_vcpu *vcpu) -{ - switch (kvm_psci_version(vcpu, vcpu->kvm)) { - case KVM_ARM_PSCI_1_0: - return kvm_psci_1_0_call(vcpu); - case KVM_ARM_PSCI_0_2: - return kvm_psci_0_2_call(vcpu); - case KVM_ARM_PSCI_0_1: - return kvm_psci_0_1_call(vcpu); - default: - return -EINVAL; - }; -} - -int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) -{ - u32 func_id = smccc_get_function(vcpu); - u32 val = SMCCC_RET_NOT_SUPPORTED; - u32 feature; - - switch (func_id) { - case ARM_SMCCC_VERSION_FUNC_ID: - val = ARM_SMCCC_VERSION_1_1; - break; - case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: - feature = smccc_get_arg1(vcpu); - switch(feature) { - case ARM_SMCCC_ARCH_WORKAROUND_1: - if (kvm_arm_harden_branch_predictor()) - val = SMCCC_RET_SUCCESS; - break; - case ARM_SMCCC_ARCH_WORKAROUND_2: - switch (kvm_arm_have_ssbd()) { - case KVM_SSBD_FORCE_DISABLE: - case KVM_SSBD_UNKNOWN: - break; - case KVM_SSBD_KERNEL: - val = SMCCC_RET_SUCCESS; - break; - case KVM_SSBD_FORCE_ENABLE: - case KVM_SSBD_MITIGATED: - val = SMCCC_RET_NOT_REQUIRED; - break; - } - break; - } - break; - default: - return kvm_psci_call(vcpu); - } - - smccc_set_retval(vcpu, val, 0, 0, 0); - return 1; -} - -int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) -{ - return 1; /* PSCI version */ -} - -int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices)) - return -EFAULT; - - return 0; -} - -int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - if (reg->id == KVM_REG_ARM_PSCI_VERSION) { - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - - val = kvm_psci_version(vcpu, 
vcpu->kvm); - if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - return 0; - } - - return -EINVAL; -} - -int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - if (reg->id == KVM_REG_ARM_PSCI_VERSION) { - void __user *uaddr = (void __user *)(long)reg->addr; - bool wants_02; - u64 val; - - if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); - - switch (val) { - case KVM_ARM_PSCI_0_1: - if (wants_02) - return -EINVAL; - vcpu->kvm->arch.psci_version = val; - return 0; - case KVM_ARM_PSCI_0_2: - case KVM_ARM_PSCI_1_0: - if (!wants_02) - return -EINVAL; - vcpu->kvm->arch.psci_version = val; - return 0; - } - } - - return -EINVAL; -} diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h deleted file mode 100644 index 3828beab93f2..000000000000 --- a/virt/kvm/arm/trace.h +++ /dev/null @@ -1,273 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVM_H - -#include <linux/tracepoint.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -/* - * Tracepoints for entry/exit to guest - */ -TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned long vcpu_pc), - TP_ARGS(vcpu_pc), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - ), - - TP_printk("PC: 0x%08lx", __entry->vcpu_pc) -); - -TRACE_EVENT(kvm_exit, - TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), - TP_ARGS(ret, esr_ec, vcpu_pc), - - TP_STRUCT__entry( - __field( int, ret ) - __field( unsigned int, esr_ec ) - __field( unsigned long, vcpu_pc ) - ), - - TP_fast_assign( - __entry->ret = ARM_EXCEPTION_CODE(ret); - __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? 
esr_ec : 0; - __entry->vcpu_pc = vcpu_pc; - ), - - TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", - __print_symbolic(__entry->ret, kvm_arm_exception_type), - __entry->esr_ec, - __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), - __entry->vcpu_pc) -); - -TRACE_EVENT(kvm_guest_fault, - TP_PROTO(unsigned long vcpu_pc, unsigned long hsr, - unsigned long hxfar, - unsigned long long ipa), - TP_ARGS(vcpu_pc, hsr, hxfar, ipa), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - __field( unsigned long, hsr ) - __field( unsigned long, hxfar ) - __field( unsigned long long, ipa ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->hsr = hsr; - __entry->hxfar = hxfar; - __entry->ipa = ipa; - ), - - TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx", - __entry->ipa, __entry->hsr, - __entry->hxfar, __entry->vcpu_pc) -); - -TRACE_EVENT(kvm_access_fault, - TP_PROTO(unsigned long ipa), - TP_ARGS(ipa), - - TP_STRUCT__entry( - __field( unsigned long, ipa ) - ), - - TP_fast_assign( - __entry->ipa = ipa; - ), - - TP_printk("IPA: %lx", __entry->ipa) -); - -TRACE_EVENT(kvm_irq_line, - TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level), - TP_ARGS(type, vcpu_idx, irq_num, level), - - TP_STRUCT__entry( - __field( unsigned int, type ) - __field( int, vcpu_idx ) - __field( int, irq_num ) - __field( int, level ) - ), - - TP_fast_assign( - __entry->type = type; - __entry->vcpu_idx = vcpu_idx; - __entry->irq_num = irq_num; - __entry->level = level; - ), - - TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d", - (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" : - (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" : - (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? 
"VGIC SPI" : "UNKNOWN", - __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level) -); - -TRACE_EVENT(kvm_mmio_emulate, - TP_PROTO(unsigned long vcpu_pc, unsigned long instr, - unsigned long cpsr), - TP_ARGS(vcpu_pc, instr, cpsr), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - __field( unsigned long, instr ) - __field( unsigned long, cpsr ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->instr = instr; - __entry->cpsr = cpsr; - ), - - TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)", - __entry->vcpu_pc, __entry->instr, __entry->cpsr) -); - -TRACE_EVENT(kvm_unmap_hva_range, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier unmap range: %#08lx -- %#08lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_set_spte_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva) -); - -TRACE_EVENT(kvm_age_hva, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier age hva: %#08lx -- %#08lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_test_age_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier test age hva: %#08lx", __entry->hva) -); - -TRACE_EVENT(kvm_set_way_flush, - TP_PROTO(unsigned long vcpu_pc, bool cache), - TP_ARGS(vcpu_pc, cache), - - TP_STRUCT__entry( - __field( unsigned 
long, vcpu_pc ) - __field( bool, cache ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->cache = cache; - ), - - TP_printk("S/W flush at 0x%016lx (cache %s)", - __entry->vcpu_pc, __entry->cache ? "on" : "off") -); - -TRACE_EVENT(kvm_toggle_cache, - TP_PROTO(unsigned long vcpu_pc, bool was, bool now), - TP_ARGS(vcpu_pc, was, now), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - __field( bool, was ) - __field( bool, now ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->was = was; - __entry->now = now; - ), - - TP_printk("VM op at 0x%016lx (cache was %s, now %s)", - __entry->vcpu_pc, __entry->was ? "on" : "off", - __entry->now ? "on" : "off") -); - -/* - * Tracepoints for arch_timer - */ -TRACE_EVENT(kvm_timer_update_irq, - TP_PROTO(unsigned long vcpu_id, __u32 irq, int level), - TP_ARGS(vcpu_id, irq, level), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_id ) - __field( __u32, irq ) - __field( int, level ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->irq = irq; - __entry->level = level; - ), - - TP_printk("VCPU: %ld, IRQ %d, level %d", - __entry->vcpu_id, __entry->irq, __entry->level) -); - -#endif /* _TRACE_KVM_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h deleted file mode 100644 index 55fed77a9f73..000000000000 --- a/virt/kvm/arm/vgic/trace.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_VGIC_H - -#include <linux/tracepoint.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -TRACE_EVENT(vgic_update_irq_pending, - TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), - TP_ARGS(vcpu_id, irq, level), - - TP_STRUCT__entry( - __field( unsigned 
long, vcpu_id ) - __field( __u32, irq ) - __field( bool, level ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->irq = irq; - __entry->level = level; - ), - - TP_printk("VCPU: %ld, IRQ %d, level: %d", - __entry->vcpu_id, __entry->irq, __entry->level) -); - -#endif /* _TRACE_VGIC_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm/vgic -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c deleted file mode 100644 index 07aa900bac56..000000000000 --- a/virt/kvm/arm/vgic/vgic-debug.c +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (C) 2016 Linaro - * Author: Christoffer Dall <christoffer.dall@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/cpu.h> -#include <linux/debugfs.h> -#include <linux/interrupt.h> -#include <linux/kvm_host.h> -#include <linux/seq_file.h> -#include <kvm/arm_vgic.h> -#include <asm/kvm_mmu.h> -#include "vgic.h" - -/* - * Structure to control looping through the entire vgic state. We start at - * zero for each field and move upwards. So, if dist_id is 0 we print the - * distributor info. When dist_id is 1, we have already printed it and move - * on. - * - * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and - * so on. 
- */ -struct vgic_state_iter { - int nr_cpus; - int nr_spis; - int nr_lpis; - int dist_id; - int vcpu_id; - int intid; - int lpi_idx; - u32 *lpi_array; -}; - -static void iter_next(struct vgic_state_iter *iter) -{ - if (iter->dist_id == 0) { - iter->dist_id++; - return; - } - - iter->intid++; - if (iter->intid == VGIC_NR_PRIVATE_IRQS && - ++iter->vcpu_id < iter->nr_cpus) - iter->intid = 0; - - if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) { - if (iter->lpi_idx < iter->nr_lpis) - iter->intid = iter->lpi_array[iter->lpi_idx]; - iter->lpi_idx++; - } -} - -static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, - loff_t pos) -{ - int nr_cpus = atomic_read(&kvm->online_vcpus); - - memset(iter, 0, sizeof(*iter)); - - iter->nr_cpus = nr_cpus; - iter->nr_spis = kvm->arch.vgic.nr_spis; - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array); - if (iter->nr_lpis < 0) - iter->nr_lpis = 0; - } - - /* Fast forward to the right position if needed */ - while (pos--) - iter_next(iter); -} - -static bool end_of_vgic(struct vgic_state_iter *iter) -{ - return iter->dist_id > 0 && - iter->vcpu_id == iter->nr_cpus && - iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && - iter->lpi_idx > iter->nr_lpis; -} - -static void *vgic_debug_start(struct seq_file *s, loff_t *pos) -{ - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter; - - mutex_lock(&kvm->lock); - iter = kvm->arch.vgic.iter; - if (iter) { - iter = ERR_PTR(-EBUSY); - goto out; - } - - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) { - iter = ERR_PTR(-ENOMEM); - goto out; - } - - iter_init(kvm, iter, *pos); - kvm->arch.vgic.iter = iter; - - if (end_of_vgic(iter)) - iter = NULL; -out: - mutex_unlock(&kvm->lock); - return iter; -} - -static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter = 
kvm->arch.vgic.iter; - - ++*pos; - iter_next(iter); - if (end_of_vgic(iter)) - iter = NULL; - return iter; -} - -static void vgic_debug_stop(struct seq_file *s, void *v) -{ - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter; - - /* - * If the seq file wasn't properly opened, there's nothing to clearn - * up. - */ - if (IS_ERR(v)) - return; - - mutex_lock(&kvm->lock); - iter = kvm->arch.vgic.iter; - kfree(iter->lpi_array); - kfree(iter); - kvm->arch.vgic.iter = NULL; - mutex_unlock(&kvm->lock); -} - -static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) -{ - bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; - - seq_printf(s, "Distributor\n"); - seq_printf(s, "===========\n"); - seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); - seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); - if (v3) - seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count); - seq_printf(s, "enabled:\t%d\n", dist->enabled); - seq_printf(s, "\n"); - - seq_printf(s, "P=pending_latch, L=line_level, A=active\n"); - seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n"); - seq_printf(s, "G=group\n"); -} - -static void print_header(struct seq_file *s, struct vgic_irq *irq, - struct kvm_vcpu *vcpu) -{ - int id = 0; - char *hdr = "SPI "; - - if (vcpu) { - hdr = "VCPU"; - id = vcpu->vcpu_id; - } - - seq_printf(s, "\n"); - seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHCG HWID TARGET SRC PRI VCPU_ID\n", hdr, id); - seq_printf(s, "----------------------------------------------------------------\n"); -} - -static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, - struct kvm_vcpu *vcpu) -{ - char *type; - if (irq->intid < VGIC_NR_SGIS) - type = "SGI"; - else if (irq->intid < VGIC_NR_PRIVATE_IRQS) - type = "PPI"; - else if (irq->intid < VGIC_MAX_SPI) - type = "SPI"; - else - type = "LPI"; - - if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS) - print_header(s, irq, vcpu); - - seq_printf(s, " %s %4d " - " %2d " - "%d%d%d%d%d%d%d " - 
"%8d " - "%8x " - " %2x " - "%3d " - " %2d " - "\n", - type, irq->intid, - (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1, - irq->pending_latch, - irq->line_level, - irq->active, - irq->enabled, - irq->hw, - irq->config == VGIC_CONFIG_LEVEL, - irq->group, - irq->hwintid, - irq->mpidr, - irq->source, - irq->priority, - (irq->vcpu) ? irq->vcpu->vcpu_id : -1); -} - -static int vgic_debug_show(struct seq_file *s, void *v) -{ - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter = (struct vgic_state_iter *)v; - struct vgic_irq *irq; - struct kvm_vcpu *vcpu = NULL; - unsigned long flags; - - if (iter->dist_id == 0) { - print_dist_state(s, &kvm->arch.vgic); - return 0; - } - - if (!kvm->arch.vgic.initialized) - return 0; - - if (iter->vcpu_id < iter->nr_cpus) - vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); - - irq = vgic_get_irq(kvm, vcpu, iter->intid); - if (!irq) { - seq_printf(s, " LPI %4d freed\n", iter->intid); - return 0; - } - - spin_lock_irqsave(&irq->irq_lock, flags); - print_irq_state(s, irq, vcpu); - spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(kvm, irq); - return 0; -} - -static const struct seq_operations vgic_debug_seq_ops = { - .start = vgic_debug_start, - .next = vgic_debug_next, - .stop = vgic_debug_stop, - .show = vgic_debug_show -}; - -static int debug_open(struct inode *inode, struct file *file) -{ - int ret; - ret = seq_open(file, &vgic_debug_seq_ops); - if (!ret) { - struct seq_file *seq; - /* seq_open will have modified file->private_data */ - seq = file->private_data; - seq->private = inode->i_private; - } - - return ret; -}; - -static const struct file_operations vgic_debug_fops = { - .owner = THIS_MODULE, - .open = debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release -}; - -void vgic_debug_init(struct kvm *kvm) -{ - debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm, - &vgic_debug_fops); -} - -void vgic_debug_destroy(struct kvm *kvm) -{ -} diff --git 
a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c deleted file mode 100644 index c0c0b88af1d5..000000000000 --- a/virt/kvm/arm/vgic/vgic-init.c +++ /dev/null @@ -1,543 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/uaccess.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/kvm_host.h> -#include <kvm/arm_vgic.h> -#include <asm/kvm_mmu.h> -#include "vgic.h" - -/* - * Initialization rules: there are multiple stages to the vgic - * initialization, both for the distributor and the CPU interfaces. The basic - * idea is that even though the VGIC is not functional or not requested from - * user space, the critical path of the run loop can still call VGIC functions - * that just won't do anything, without them having to check additional - * initialization flags to ensure they don't look at uninitialized data - * structures. - * - * Distributor: - * - * - kvm_vgic_early_init(): initialization of static data that doesn't - * depend on any sizing information or emulation type. No allocation - * is allowed there. - * - * - vgic_init(): allocation and initialization of the generic data - * structures that depend on sizing information (number of CPUs, - * number of interrupts). Also initializes the vcpu specific data - * structures. Can be executed lazily for GICv2. 
- * - * CPU Interface: - * - * - kvm_vgic_vcpu_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. - */ - -/* EARLY INIT */ - -/** - * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures - * @kvm: The VM whose VGIC districutor should be initialized - * - * Only do initialization of static structures that don't require any - * allocation or sizing information from userspace. vgic_init() called - * kvm_vgic_dist_init() which takes care of the rest. - */ -void kvm_vgic_early_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - INIT_LIST_HEAD(&dist->lpi_list_head); - spin_lock_init(&dist->lpi_list_lock); -} - -/* CREATION */ - -/** - * kvm_vgic_create: triggered by the instantiation of the VGIC device by - * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) - * or through the generic KVM_CREATE_DEVICE API ioctl. - * irqchip_in_kernel() tells you if this function succeeded or not. - * @kvm: kvm struct pointer - * @type: KVM_DEV_TYPE_ARM_VGIC_V[23] - */ -int kvm_vgic_create(struct kvm *kvm, u32 type) -{ - int i, vcpu_lock_idx = -1, ret; - struct kvm_vcpu *vcpu; - - if (irqchip_in_kernel(kvm)) - return -EEXIST; - - /* - * This function is also called by the KVM_CREATE_IRQCHIP handler, - * which had no chance yet to check the availability of the GICv2 - * emulation. So check this here again. KVM_CREATE_DEVICE does - * the proper checks already. - */ - if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && - !kvm_vgic_global_state.can_emulate_gicv2) - return -ENODEV; - - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run while we create the vgic. 
- */ - ret = -EBUSY; - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!mutex_trylock(&vcpu->mutex)) - goto out_unlock; - vcpu_lock_idx = i; - } - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->arch.has_run_once) - goto out_unlock; - } - ret = 0; - - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) - kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS; - else - kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS; - - if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) { - ret = -E2BIG; - goto out_unlock; - } - - kvm->arch.vgic.in_kernel = true; - kvm->arch.vgic.vgic_model = type; - - kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) - kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - else - INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); - -out_unlock: - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); - mutex_unlock(&vcpu->mutex); - } - return ret; -} - -/* INIT/DESTROY */ - -/** - * kvm_vgic_dist_init: initialize the dist data structures - * @kvm: kvm struct pointer - * @nr_spis: number of spis, frozen by caller - */ -static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); - int i; - - dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); - if (!dist->spis) - return -ENOMEM; - - /* - * In the following code we do not take the irq struct lock since - * no other action on irq structs can happen while the VGIC is - * not initialized yet: - * If someone wants to inject an interrupt or does a MMIO access, we - * require prior initialization in case of a virtual GICv3 or trigger - * initialization when using a virtual GICv2. 
- */ - for (i = 0; i < nr_spis; i++) { - struct vgic_irq *irq = &dist->spis[i]; - - irq->intid = i + VGIC_NR_PRIVATE_IRQS; - INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); - irq->vcpu = NULL; - irq->target_vcpu = vcpu0; - kref_init(&irq->refcount); - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { - irq->targets = 0; - irq->group = 0; - } else { - irq->mpidr = 0; - irq->group = 1; - } - } - return 0; -} - -/** - * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data - * structures and register VCPU-specific KVM iodevs - * - * @vcpu: pointer to the VCPU being created and initialized - * - * Only do initialization, but do not actually enable the - * VGIC CPU interface - */ -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int ret = 0; - int i; - - vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; - vgic_cpu->sgi_iodev.base_addr = VGIC_ADDR_UNDEF; - - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - spin_lock_init(&vgic_cpu->ap_list_lock); - - /* - * Enable and configure all SGIs to be edge-triggered and - * configure all PPIs as level-triggered. - */ - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - - INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); - irq->intid = i; - irq->vcpu = NULL; - irq->target_vcpu = vcpu; - irq->targets = 1U << vcpu->vcpu_id; - kref_init(&irq->refcount); - if (vgic_irq_is_sgi(i)) { - /* SGIs */ - irq->enabled = 1; - irq->config = VGIC_CONFIG_EDGE; - } else { - /* PPIs */ - irq->config = VGIC_CONFIG_LEVEL; - } - - /* - * GICv3 can only be created via the KVM_DEVICE_CREATE API and - * so we always know the emulation type at this point as it's - * either explicitly configured as GICv3, or explicitly - * configured as GICv2, or not configured yet which also - * implies GICv2. 
- */ - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - irq->group = 1; - else - irq->group = 0; - } - - if (!irqchip_in_kernel(vcpu->kvm)) - return 0; - - /* - * If we are creating a VCPU with a GICv3 we must also register the - * KVM io device for the redistributor that belongs to this VCPU. - */ - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - mutex_lock(&vcpu->kvm->lock); - ret = vgic_register_redist_iodev(vcpu); - mutex_unlock(&vcpu->kvm->lock); - } - return ret; -} - -static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_enable(vcpu); - else - vgic_v3_enable(vcpu); -} - -/* - * vgic_init: allocates and initializes dist and vcpu data structures - * depending on two dimensioning parameters: - * - the number of spis - * - the number of vcpus - * The function is generally called when nr_spis has been explicitly set - * by the guest through the KVM DEVICE API. If not nr_spis is set to 256. - * vgic_initialized() returns true when this function has succeeded. - * Must be called with kvm->lock held! - */ -int vgic_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int ret = 0, i; - - if (vgic_initialized(kvm)) - return 0; - - /* Are we also in the middle of creating a VCPU? 
*/ - if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) - return -EBUSY; - - /* freeze the number of spis */ - if (!dist->nr_spis) - dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; - - ret = kvm_vgic_dist_init(kvm, dist->nr_spis); - if (ret) - goto out; - - if (vgic_has_its(kvm)) { - ret = vgic_v4_init(kvm); - if (ret) - goto out; - } - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vgic_vcpu_enable(vcpu); - - ret = kvm_vgic_setup_default_irq_routing(kvm); - if (ret) - goto out; - - vgic_debug_init(kvm); - - dist->implementation_rev = 2; - dist->initialized = true; - -out: - return ret; -} - -static void kvm_vgic_dist_destroy(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_redist_region *rdreg, *next; - - dist->ready = false; - dist->initialized = false; - - kfree(dist->spis); - dist->spis = NULL; - dist->nr_spis = 0; - - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) { - list_del(&rdreg->list); - kfree(rdreg); - } - INIT_LIST_HEAD(&dist->rd_regions); - } - - if (vgic_supports_direct_msis(kvm)) - vgic_v4_teardown(kvm); -} - -void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); -} - -/* To be called with kvm->lock held */ -static void __kvm_vgic_destroy(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int i; - - vgic_debug_destroy(kvm); - - kvm_vgic_dist_destroy(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vgic_vcpu_destroy(vcpu); -} - -void kvm_vgic_destroy(struct kvm *kvm) -{ - mutex_lock(&kvm->lock); - __kvm_vgic_destroy(kvm); - mutex_unlock(&kvm->lock); -} - -/** - * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest - * is a GICv2. A GICv3 must be explicitly initialized by the guest using the - * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group. 
- * @kvm: kvm struct pointer - */ -int vgic_lazy_init(struct kvm *kvm) -{ - int ret = 0; - - if (unlikely(!vgic_initialized(kvm))) { - /* - * We only provide the automatic initialization of the VGIC - * for the legacy case of a GICv2. Any other type must - * be explicitly initialized once setup with the respective - * KVM device call. - */ - if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) - return -EBUSY; - - mutex_lock(&kvm->lock); - ret = vgic_init(kvm); - mutex_unlock(&kvm->lock); - } - - return ret; -} - -/* RESOURCE MAPPING */ - -/** - * Map the MMIO regions depending on the VGIC model exposed to the guest - * called on the first VCPU run. - * Also map the virtual CPU interface into the VM. - * v2/v3 derivatives call vgic_init if not already done. - * vgic_ready() returns true if this function has succeeded. - * @kvm: kvm struct pointer - */ -int kvm_vgic_map_resources(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int ret = 0; - - mutex_lock(&kvm->lock); - if (!irqchip_in_kernel(kvm)) - goto out; - - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) - ret = vgic_v2_map_resources(kvm); - else - ret = vgic_v3_map_resources(kvm); - - if (ret) - __kvm_vgic_destroy(kvm); - -out: - mutex_unlock(&kvm->lock); - return ret; -} - -/* GENERIC PROBE */ - -static int vgic_init_cpu_starting(unsigned int cpu) -{ - enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); - return 0; -} - - -static int vgic_init_cpu_dying(unsigned int cpu) -{ - disable_percpu_irq(kvm_vgic_global_state.maint_irq); - return 0; -} - -static irqreturn_t vgic_maintenance_handler(int irq, void *data) -{ - /* - * We cannot rely on the vgic maintenance interrupt to be - * delivered synchronously. This means we can only use it to - * exit the VM, and we perform the handling of EOIed - * interrupts on the exit path (see vgic_fold_lr_state). 
- */ - return IRQ_HANDLED; -} - -/** - * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware - * - * For a specific CPU, initialize the GIC VE hardware. - */ -void kvm_vgic_init_cpu_hardware(void) -{ - BUG_ON(preemptible()); - - /* - * We want to make sure the list registers start out clear so that we - * only have the program the used registers. - */ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_init_lrs(); - else - kvm_call_hyp(__vgic_v3_init_lrs); -} - -/** - * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable - * according to the host GIC model. Accordingly calls either - * vgic_v2/v3_probe which registers the KVM_DEVICE that can be - * instantiated by a guest later on . - */ -int kvm_vgic_hyp_init(void) -{ - const struct gic_kvm_info *gic_kvm_info; - int ret; - - gic_kvm_info = gic_get_kvm_info(); - if (!gic_kvm_info) - return -ENODEV; - - if (!gic_kvm_info->maint_irq) { - kvm_err("No vgic maintenance irq\n"); - return -ENXIO; - } - - switch (gic_kvm_info->type) { - case GIC_V2: - ret = vgic_v2_probe(gic_kvm_info); - break; - case GIC_V3: - ret = vgic_v3_probe(gic_kvm_info); - if (!ret) { - static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); - kvm_info("GIC system register CPU interface enabled\n"); - } - break; - default: - ret = -ENODEV; - }; - - if (ret) - return ret; - - kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq; - ret = request_percpu_irq(kvm_vgic_global_state.maint_irq, - vgic_maintenance_handler, - "vgic", kvm_get_running_vcpus()); - if (ret) { - kvm_err("Cannot register interrupt %d\n", - kvm_vgic_global_state.maint_irq); - return ret; - } - - ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, - "kvm/arm/vgic:starting", - vgic_init_cpu_starting, vgic_init_cpu_dying); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - - kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); - return 0; - -out_free_irq: - 
free_percpu_irq(kvm_vgic_global_state.maint_irq, - kvm_get_running_vcpus()); - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c deleted file mode 100644 index 99e026d2dade..000000000000 --- a/virt/kvm/arm/vgic/vgic-irqfd.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <trace/events/kvm.h> -#include <kvm/arm_vgic.h> -#include "vgic.h" - -/** - * vgic_irqfd_set_irq: inject the IRQ corresponding to the - * irqchip routing entry - * - * This is the entry point for irqfd IRQ injection - */ -static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS; - - if (!vgic_valid_spi(kvm, spi_id)) - return -EINVAL; - return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL); -} - -/** - * kvm_set_routing_entry: populate a kvm routing entry - * from a user routing entry - * - * @kvm: the VM this entry is applied to - * @e: kvm kernel routing entry handle - * @ue: user api routing entry handle - * return 0 on success, -EINVAL on errors. 
- */ -int kvm_set_routing_entry(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *e, - const struct kvm_irq_routing_entry *ue) -{ - int r = -EINVAL; - - switch (ue->type) { - case KVM_IRQ_ROUTING_IRQCHIP: - e->set = vgic_irqfd_set_irq; - e->irqchip.irqchip = ue->u.irqchip.irqchip; - e->irqchip.pin = ue->u.irqchip.pin; - if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) || - (e->irqchip.irqchip >= KVM_NR_IRQCHIPS)) - goto out; - break; - case KVM_IRQ_ROUTING_MSI: - e->set = kvm_set_msi; - e->msi.address_lo = ue->u.msi.address_lo; - e->msi.address_hi = ue->u.msi.address_hi; - e->msi.data = ue->u.msi.data; - e->msi.flags = ue->flags; - e->msi.devid = ue->u.msi.devid; - break; - default: - goto out; - } - r = 0; -out: - return r; -} - -/** - * kvm_set_msi: inject the MSI corresponding to the - * MSI routing entry - * - * This is the entry point for irqfd MSI injection - * and userspace MSI injection. - */ -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - struct kvm_msi msi; - - msi.address_lo = e->msi.address_lo; - msi.address_hi = e->msi.address_hi; - msi.data = e->msi.data; - msi.flags = e->msi.flags; - msi.devid = e->msi.devid; - - if (!vgic_has_its(kvm)) - return -ENODEV; - - if (!level) - return -1; - - return vgic_its_inject_msi(kvm, &msi); -} - -int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) -{ - struct kvm_irq_routing_entry *entries; - struct vgic_dist *dist = &kvm->arch.vgic; - u32 nr = dist->nr_spis; - int i, ret; - - entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL); - if (!entries) - return -ENOMEM; - - for (i = 0; i < nr; i++) { - entries[i].gsi = i; - entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; - entries[i].u.irqchip.irqchip = 0; - entries[i].u.irqchip.pin = i; - } - ret = kvm_set_irq_routing(kvm, entries, nr, 0); - kfree(entries); - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c deleted file mode 100644 index 
eb2a390a6c86..000000000000 --- a/virt/kvm/arm/vgic/vgic-its.c +++ /dev/null @@ -1,2569 +0,0 @@ -/* - * GICv3 ITS emulation - * - * Copyright (C) 2015,2016 ARM Ltd. - * Author: Andre Przywara <andre.przywara@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/cpu.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/interrupt.h> -#include <linux/list.h> -#include <linux/uaccess.h> -#include <linux/list_sort.h> - -#include <linux/irqchip/arm-gic-v3.h> - -#include <asm/kvm_emulate.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_mmu.h> - -#include "vgic.h" -#include "vgic-mmio.h" - -static int vgic_its_save_tables_v0(struct vgic_its *its); -static int vgic_its_restore_tables_v0(struct vgic_its *its); -static int vgic_its_commit_v0(struct vgic_its *its); -static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, - struct kvm_vcpu *filter_vcpu, bool needs_inv); - -/* - * Creates a new (reference to a) struct vgic_irq for a given LPI. - * If this LPI is already mapped on another ITS, we increase its refcount - * and return a pointer to the existing structure. - * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq. - * This function returns a pointer to the _unlocked_ structure. 
- */ -static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, - struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq; - unsigned long flags; - int ret; - - /* In this case there is no put, since we keep the reference. */ - if (irq) - return irq; - - irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); - if (!irq) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&irq->lpi_list); - INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); - - irq->config = VGIC_CONFIG_EDGE; - kref_init(&irq->refcount); - irq->intid = intid; - irq->target_vcpu = vcpu; - irq->group = 1; - - spin_lock_irqsave(&dist->lpi_list_lock, flags); - - /* - * There could be a race with another vgic_add_lpi(), so we need to - * check that we don't add a second list entry with the same LPI. - */ - list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { - if (oldirq->intid != intid) - continue; - - /* Someone was faster with adding this LPI, lets use that. */ - kfree(irq); - irq = oldirq; - - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() on the returned pointer once it's - * finished with the IRQ. - */ - vgic_get_irq_kref(irq); - - goto out_unlock; - } - - list_add_tail(&irq->lpi_list, &dist->lpi_list_head); - dist->lpi_list_count++; - -out_unlock: - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - /* - * We "cache" the configuration table entries in our struct vgic_irq's. - * However we only have those structs for mapped IRQs, so we read in - * the respective config data from memory here upon mapping the LPI. 
- */ - ret = update_lpi_config(kvm, irq, NULL, false); - if (ret) - return ERR_PTR(ret); - - ret = vgic_v3_lpi_sync_pending_status(kvm, irq); - if (ret) - return ERR_PTR(ret); - - return irq; -} - -struct its_device { - struct list_head dev_list; - - /* the head for the list of ITTEs */ - struct list_head itt_head; - u32 num_eventid_bits; - gpa_t itt_addr; - u32 device_id; -}; - -#define COLLECTION_NOT_MAPPED ((u32)~0) - -struct its_collection { - struct list_head coll_list; - - u32 collection_id; - u32 target_addr; -}; - -#define its_is_collection_mapped(coll) ((coll) && \ - ((coll)->target_addr != COLLECTION_NOT_MAPPED)) - -struct its_ite { - struct list_head ite_list; - - struct vgic_irq *irq; - struct its_collection *collection; - u32 event_id; -}; - -/** - * struct vgic_its_abi - ITS abi ops and settings - * @cte_esz: collection table entry size - * @dte_esz: device table entry size - * @ite_esz: interrupt translation table entry size - * @save tables: save the ITS tables into guest RAM - * @restore_tables: restore the ITS internal structs from tables - * stored in guest RAM - * @commit: initialize the registers which expose the ABI settings, - * especially the entry sizes - */ -struct vgic_its_abi { - int cte_esz; - int dte_esz; - int ite_esz; - int (*save_tables)(struct vgic_its *its); - int (*restore_tables)(struct vgic_its *its); - int (*commit)(struct vgic_its *its); -}; - -#define ABI_0_ESZ 8 -#define ESZ_MAX ABI_0_ESZ - -static const struct vgic_its_abi its_table_abi_versions[] = { - [0] = { - .cte_esz = ABI_0_ESZ, - .dte_esz = ABI_0_ESZ, - .ite_esz = ABI_0_ESZ, - .save_tables = vgic_its_save_tables_v0, - .restore_tables = vgic_its_restore_tables_v0, - .commit = vgic_its_commit_v0, - }, -}; - -#define NR_ITS_ABIS ARRAY_SIZE(its_table_abi_versions) - -inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) -{ - return &its_table_abi_versions[its->abi_rev]; -} - -static int vgic_its_set_abi(struct vgic_its *its, u32 rev) -{ - const struct 
vgic_its_abi *abi; - - its->abi_rev = rev; - abi = vgic_its_get_abi(its); - return abi->commit(its); -} - -/* - * Find and returns a device in the device table for an ITS. - * Must be called with the its_lock mutex held. - */ -static struct its_device *find_its_device(struct vgic_its *its, u32 device_id) -{ - struct its_device *device; - - list_for_each_entry(device, &its->device_list, dev_list) - if (device_id == device->device_id) - return device; - - return NULL; -} - -/* - * Find and returns an interrupt translation table entry (ITTE) for a given - * Device ID/Event ID pair on an ITS. - * Must be called with the its_lock mutex held. - */ -static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, - u32 event_id) -{ - struct its_device *device; - struct its_ite *ite; - - device = find_its_device(its, device_id); - if (device == NULL) - return NULL; - - list_for_each_entry(ite, &device->itt_head, ite_list) - if (ite->event_id == event_id) - return ite; - - return NULL; -} - -/* To be used as an iterator this macro misses the enclosing parentheses */ -#define for_each_lpi_its(dev, ite, its) \ - list_for_each_entry(dev, &(its)->device_list, dev_list) \ - list_for_each_entry(ite, &(dev)->itt_head, ite_list) - -#define GIC_LPI_OFFSET 8192 - -#define VITS_TYPER_IDBITS 16 -#define VITS_TYPER_DEVBITS 16 -#define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1) -#define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1) - -/* - * Finds and returns a collection in the ITS collection table. - * Must be called with the its_lock mutex held. 
- */ -static struct its_collection *find_collection(struct vgic_its *its, int coll_id) -{ - struct its_collection *collection; - - list_for_each_entry(collection, &its->collection_list, coll_list) { - if (coll_id == collection->collection_id) - return collection; - } - - return NULL; -} - -#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) -#define LPI_PROP_PRIORITY(p) ((p) & 0xfc) - -/* - * Reads the configuration data for a given LPI from guest memory and - * updates the fields in struct vgic_irq. - * If filter_vcpu is not NULL, applies only if the IRQ is targeting this - * VCPU. Unconditionally applies if filter_vcpu is NULL. - */ -static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, - struct kvm_vcpu *filter_vcpu, bool needs_inv) -{ - u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); - u8 prop; - int ret; - unsigned long flags; - - ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET, - &prop, 1); - - if (ret) - return ret; - - spin_lock_irqsave(&irq->irq_lock, flags); - - if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { - irq->priority = LPI_PROP_PRIORITY(prop); - irq->enabled = LPI_PROP_ENABLE_BIT(prop); - - if (!irq->hw) { - vgic_queue_irq_unlock(kvm, irq, flags); - return 0; - } - } - - spin_unlock_irqrestore(&irq->irq_lock, flags); - - if (irq->hw) - return its_prop_update_vlpi(irq->host_irq, prop, needs_inv); - - return 0; -} - -/* - * Create a snapshot of the current LPIs targeting @vcpu, so that we can - * enumerate those LPIs without holding any lock. - * Returns their number and puts the kmalloc'ed array into intid_ptr. - */ -int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq; - unsigned long flags; - u32 *intids; - int irq_count, i = 0; - - /* - * There is an obvious race between allocating the array and LPIs - * being mapped/unmapped. 
If we ended up here as a result of a - * command, we're safe (locks are held, preventing another - * command). If coming from another path (such as enabling LPIs), - * we must be careful not to overrun the array. - */ - irq_count = READ_ONCE(dist->lpi_list_count); - intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); - if (!intids) - return -ENOMEM; - - spin_lock_irqsave(&dist->lpi_list_lock, flags); - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - if (i == irq_count) - break; - /* We don't need to "get" the IRQ, as we hold the list lock. */ - if (vcpu && irq->target_vcpu != vcpu) - continue; - intids[i++] = irq->intid; - } - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - *intid_ptr = intids; - return i; -} - -static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) -{ - int ret = 0; - unsigned long flags; - - spin_lock_irqsave(&irq->irq_lock, flags); - irq->target_vcpu = vcpu; - spin_unlock_irqrestore(&irq->irq_lock, flags); - - if (irq->hw) { - struct its_vlpi_map map; - - ret = its_get_vlpi(irq->host_irq, &map); - if (ret) - return ret; - - map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - - ret = its_map_vlpi(irq->host_irq, &map); - } - - return ret; -} - -/* - * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI - * is targeting) to the VGIC's view, which deals with target VCPUs. - * Needs to be called whenever either the collection for a LPIs has - * changed or the collection itself got retargeted. - */ -static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite) -{ - struct kvm_vcpu *vcpu; - - if (!its_is_collection_mapped(ite->collection)) - return; - - vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); - update_affinity(ite->irq, vcpu); -} - -/* - * Updates the target VCPU for every LPI targeting this collection. - * Must be called with the its_lock mutex held. 
- */ -static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, - struct its_collection *coll) -{ - struct its_device *device; - struct its_ite *ite; - - for_each_lpi_its(device, ite, its) { - if (!ite->collection || coll != ite->collection) - continue; - - update_affinity_ite(kvm, ite); - } -} - -static u32 max_lpis_propbaser(u64 propbaser) -{ - int nr_idbits = (propbaser & 0x1f) + 1; - - return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS); -} - -/* - * Sync the pending table pending bit of LPIs targeting @vcpu - * with our own data structures. This relies on the LPI being - * mapped before. - */ -static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) -{ - gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); - struct vgic_irq *irq; - int last_byte_offset = -1; - int ret = 0; - u32 *intids; - int nr_irqs, i; - unsigned long flags; - u8 pendmask; - - nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids); - if (nr_irqs < 0) - return nr_irqs; - - for (i = 0; i < nr_irqs; i++) { - int byte_offset, bit_nr; - - byte_offset = intids[i] / BITS_PER_BYTE; - bit_nr = intids[i] % BITS_PER_BYTE; - - /* - * For contiguously allocated LPIs chances are we just read - * this very same byte in the last iteration. Reuse that. 
- */ - if (byte_offset != last_byte_offset) { - ret = kvm_read_guest_lock(vcpu->kvm, - pendbase + byte_offset, - &pendmask, 1); - if (ret) { - kfree(intids); - return ret; - } - last_byte_offset = byte_offset; - } - - irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); - spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = pendmask & (1U << bit_nr); - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - vgic_put_irq(vcpu->kvm, irq); - } - - kfree(intids); - - return ret; -} - -static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 reg = GITS_TYPER_PLPIS; - - /* - * We use linear CPU numbers for redistributor addressing, - * so GITS_TYPER.PTA is 0. - * Also we force all PROPBASER registers to be the same, so - * CommonLPIAff is 0 as well. - * To avoid memory waste in the guest, we keep the number of IDBits and - * DevBits low - as least for the time being. 
- */ - reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT; - reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT; - reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; - - return extract_bytes(reg, addr & 7, len); -} - -static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u32 val; - - val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK; - val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM; - return val; -} - -static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 rev = GITS_IIDR_REV(val); - - if (rev >= NR_ITS_ABIS) - return -EINVAL; - return vgic_its_set_abi(its, rev); -} - -static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - switch (addr & 0xffff) { - case GITS_PIDR0: - return 0x92; /* part number, bits[7:0] */ - case GITS_PIDR1: - return 0xb4; /* part number, bits[11:8] */ - case GITS_PIDR2: - return GIC_PIDR2_ARCH_GICv3 | 0x0b; - case GITS_PIDR4: - return 0x40; /* This is a 64K software visible page */ - /* The following are the ID registers for (any) GIC. 
*/ - case GITS_CIDR0: - return 0x0d; - case GITS_CIDR1: - return 0xf0; - case GITS_CIDR2: - return 0x05; - case GITS_CIDR3: - return 0xb1; - } - - return 0; -} - -int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid, struct vgic_irq **irq) -{ - struct kvm_vcpu *vcpu; - struct its_ite *ite; - - if (!its->enabled) - return -EBUSY; - - ite = find_ite(its, devid, eventid); - if (!ite || !its_is_collection_mapped(ite->collection)) - return E_ITS_INT_UNMAPPED_INTERRUPT; - - vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); - if (!vcpu) - return E_ITS_INT_UNMAPPED_INTERRUPT; - - if (!vcpu->arch.vgic_cpu.lpis_enabled) - return -EBUSY; - - *irq = ite->irq; - return 0; -} - -struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) -{ - u64 address; - struct kvm_io_device *kvm_io_dev; - struct vgic_io_device *iodev; - - if (!vgic_has_its(kvm)) - return ERR_PTR(-ENODEV); - - if (!(msi->flags & KVM_MSI_VALID_DEVID)) - return ERR_PTR(-EINVAL); - - address = (u64)msi->address_hi << 32 | msi->address_lo; - - kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); - if (!kvm_io_dev) - return ERR_PTR(-EINVAL); - - if (kvm_io_dev->ops != &kvm_io_gic_ops) - return ERR_PTR(-EINVAL); - - iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); - if (iodev->iodev_type != IODEV_ITS) - return ERR_PTR(-EINVAL); - - return iodev->its; -} - -/* - * Find the target VCPU and the LPI number for a given devid/eventid pair - * and make this IRQ pending, possibly injecting it. - * Must be called with the its_lock mutex held. - * Returns 0 on success, a positive error value for any ITS mapping - * related errors and negative error values for generic errors. 
- */ -static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid) -{ - struct vgic_irq *irq = NULL; - unsigned long flags; - int err; - - err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq); - if (err) - return err; - - if (irq->hw) - return irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, true); - - spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - vgic_queue_irq_unlock(kvm, irq, flags); - - return 0; -} - -/* - * Queries the KVM IO bus framework to get the ITS pointer from the given - * doorbell address. - * We then call vgic_its_trigger_msi() with the decoded data. - * According to the KVM_SIGNAL_MSI API description returns 1 on success. - */ -int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) -{ - struct vgic_its *its; - int ret; - - its = vgic_msi_to_its(kvm, msi); - if (IS_ERR(its)) - return PTR_ERR(its); - - mutex_lock(&its->its_lock); - ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data); - mutex_unlock(&its->its_lock); - - if (ret < 0) - return ret; - - /* - * KVM_SIGNAL_MSI demands a return value > 0 for success and 0 - * if the guest has blocked the MSI. So we map any LPI mapping - * related error to that. - */ - if (ret) - return 0; - else - return 1; -} - -/* Requires the its_lock to be held. */ -static void its_free_ite(struct kvm *kvm, struct its_ite *ite) -{ - list_del(&ite->ite_list); - - /* This put matches the get in vgic_add_lpi. 
*/ - if (ite->irq) { - if (ite->irq->hw) - WARN_ON(its_unmap_vlpi(ite->irq->host_irq)); - - vgic_put_irq(kvm, ite->irq); - } - - kfree(ite); -} - -static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) -{ - return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1); -} - -#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) -#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) -#define its_cmd_get_size(cmd) (its_cmd_mask_field(cmd, 1, 0, 5) + 1) -#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) -#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) -#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) -#define its_cmd_get_ittaddr(cmd) (its_cmd_mask_field(cmd, 2, 8, 44) << 8) -#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32) -#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1) - -/* - * The DISCARD command frees an Interrupt Translation Table Entry (ITTE). - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - struct its_ite *ite; - - - ite = find_ite(its, device_id, event_id); - if (ite && ite->collection) { - /* - * Though the spec talks about removing the pending state, we - * don't bother here since we clear the ITTE anyway and the - * pending state is a property of the ITTE struct. - */ - its_free_ite(kvm, ite); - return 0; - } - - return E_ITS_DISCARD_UNMAPPED_INTERRUPT; -} - -/* - * The MOVI command moves an ITTE to a different collection. - * Must be called with the its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - u32 coll_id = its_cmd_get_collection(its_cmd); - struct kvm_vcpu *vcpu; - struct its_ite *ite; - struct its_collection *collection; - - ite = find_ite(its, device_id, event_id); - if (!ite) - return E_ITS_MOVI_UNMAPPED_INTERRUPT; - - if (!its_is_collection_mapped(ite->collection)) - return E_ITS_MOVI_UNMAPPED_COLLECTION; - - collection = find_collection(its, coll_id); - if (!its_is_collection_mapped(collection)) - return E_ITS_MOVI_UNMAPPED_COLLECTION; - - ite->collection = collection; - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - return update_affinity(ite->irq, vcpu); -} - -/* - * Check whether an ID can be stored into the corresponding guest table. - * For a direct table this is pretty easy, but gets a bit nasty for - * indirect tables. We check whether the resulting guest physical address - * is actually valid (covered by a memslot and guest accessible). - * For this we have to read the respective first level entry. 
- */ -static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, - gpa_t *eaddr) -{ - int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; - u64 indirect_ptr, type = GITS_BASER_TYPE(baser); - phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); - int esz = GITS_BASER_ENTRY_SIZE(baser); - int index; - gfn_t gfn; - - switch (type) { - case GITS_BASER_TYPE_DEVICE: - if (id >= BIT_ULL(VITS_TYPER_DEVBITS)) - return false; - break; - case GITS_BASER_TYPE_COLLECTION: - /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */ - if (id >= BIT_ULL(16)) - return false; - break; - default: - return false; - } - - if (!(baser & GITS_BASER_INDIRECT)) { - phys_addr_t addr; - - if (id >= (l1_tbl_size / esz)) - return false; - - addr = base + id * esz; - gfn = addr >> PAGE_SHIFT; - - if (eaddr) - *eaddr = addr; - return kvm_is_visible_gfn(its->dev->kvm, gfn); - } - - /* calculate and check the index into the 1st level */ - index = id / (SZ_64K / esz); - if (index >= (l1_tbl_size / sizeof(u64))) - return false; - - /* Each 1st level entry is represented by a 64-bit value. */ - if (kvm_read_guest_lock(its->dev->kvm, - base + index * sizeof(indirect_ptr), - &indirect_ptr, sizeof(indirect_ptr))) - return false; - - indirect_ptr = le64_to_cpu(indirect_ptr); - - /* check the valid bit of the first level entry */ - if (!(indirect_ptr & BIT_ULL(63))) - return false; - - /* Mask the guest physical address and calculate the frame number. 
*/ - indirect_ptr &= GENMASK_ULL(51, 16); - - /* Find the address of the actual entry */ - index = id % (SZ_64K / esz); - indirect_ptr += index * esz; - gfn = indirect_ptr >> PAGE_SHIFT; - - if (eaddr) - *eaddr = indirect_ptr; - return kvm_is_visible_gfn(its->dev->kvm, gfn); -} - -static int vgic_its_alloc_collection(struct vgic_its *its, - struct its_collection **colp, - u32 coll_id) -{ - struct its_collection *collection; - - if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) - return E_ITS_MAPC_COLLECTION_OOR; - - collection = kzalloc(sizeof(*collection), GFP_KERNEL); - if (!collection) - return -ENOMEM; - - collection->collection_id = coll_id; - collection->target_addr = COLLECTION_NOT_MAPPED; - - list_add_tail(&collection->coll_list, &its->collection_list); - *colp = collection; - - return 0; -} - -static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) -{ - struct its_collection *collection; - struct its_device *device; - struct its_ite *ite; - - /* - * Clearing the mapping for that collection ID removes the - * entry from the list. If there wasn't any before, we can - * go home early. - */ - collection = find_collection(its, coll_id); - if (!collection) - return; - - for_each_lpi_its(device, ite, its) - if (ite->collection && - ite->collection->collection_id == coll_id) - ite->collection = NULL; - - list_del(&collection->coll_list); - kfree(collection); -} - -/* Must be called with its_lock mutex held */ -static struct its_ite *vgic_its_alloc_ite(struct its_device *device, - struct its_collection *collection, - u32 event_id) -{ - struct its_ite *ite; - - ite = kzalloc(sizeof(*ite), GFP_KERNEL); - if (!ite) - return ERR_PTR(-ENOMEM); - - ite->event_id = event_id; - ite->collection = collection; - - list_add_tail(&ite->ite_list, &device->itt_head); - return ite; -} - -/* - * The MAPTI and MAPI commands map LPIs to ITTEs. - * Must be called with its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - u32 coll_id = its_cmd_get_collection(its_cmd); - struct its_ite *ite; - struct kvm_vcpu *vcpu = NULL; - struct its_device *device; - struct its_collection *collection, *new_coll = NULL; - struct vgic_irq *irq; - int lpi_nr; - - device = find_its_device(its, device_id); - if (!device) - return E_ITS_MAPTI_UNMAPPED_DEVICE; - - if (event_id >= BIT_ULL(device->num_eventid_bits)) - return E_ITS_MAPTI_ID_OOR; - - if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) - lpi_nr = its_cmd_get_physical_id(its_cmd); - else - lpi_nr = event_id; - if (lpi_nr < GIC_LPI_OFFSET || - lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) - return E_ITS_MAPTI_PHYSICALID_OOR; - - /* If there is an existing mapping, behavior is UNPREDICTABLE. */ - if (find_ite(its, device_id, event_id)) - return 0; - - collection = find_collection(its, coll_id); - if (!collection) { - int ret = vgic_its_alloc_collection(its, &collection, coll_id); - if (ret) - return ret; - new_coll = collection; - } - - ite = vgic_its_alloc_ite(device, collection, event_id); - if (IS_ERR(ite)) { - if (new_coll) - vgic_its_free_collection(its, coll_id); - return PTR_ERR(ite); - } - - if (its_is_collection_mapped(collection)) - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - irq = vgic_add_lpi(kvm, lpi_nr, vcpu); - if (IS_ERR(irq)) { - if (new_coll) - vgic_its_free_collection(its, coll_id); - its_free_ite(kvm, ite); - return PTR_ERR(irq); - } - ite->irq = irq; - - return 0; -} - -/* Requires the its_lock to be held. */ -static void vgic_its_free_device(struct kvm *kvm, struct its_device *device) -{ - struct its_ite *ite, *temp; - - /* - * The spec says that unmapping a device with still valid - * ITTEs associated is UNPREDICTABLE. We remove all ITTEs, - * since we cannot leave the memory unreferenced. 
- */ - list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list) - its_free_ite(kvm, ite); - - list_del(&device->dev_list); - kfree(device); -} - -/* its lock must be held */ -static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its) -{ - struct its_device *cur, *temp; - - list_for_each_entry_safe(cur, temp, &its->device_list, dev_list) - vgic_its_free_device(kvm, cur); -} - -/* its lock must be held */ -static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its) -{ - struct its_collection *cur, *temp; - - list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list) - vgic_its_free_collection(its, cur->collection_id); -} - -/* Must be called with its_lock mutex held */ -static struct its_device *vgic_its_alloc_device(struct vgic_its *its, - u32 device_id, gpa_t itt_addr, - u8 num_eventid_bits) -{ - struct its_device *device; - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - return ERR_PTR(-ENOMEM); - - device->device_id = device_id; - device->itt_addr = itt_addr; - device->num_eventid_bits = num_eventid_bits; - INIT_LIST_HEAD(&device->itt_head); - - list_add_tail(&device->dev_list, &its->device_list); - return device; -} - -/* - * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). - * Must be called with the its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - bool valid = its_cmd_get_validbit(its_cmd); - u8 num_eventid_bits = its_cmd_get_size(its_cmd); - gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); - struct its_device *device; - - if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL)) - return E_ITS_MAPD_DEVICE_OOR; - - if (valid && num_eventid_bits > VITS_TYPER_IDBITS) - return E_ITS_MAPD_ITTSIZE_OOR; - - device = find_its_device(its, device_id); - - /* - * The spec says that calling MAPD on an already mapped device - * invalidates all cached data for this device. We implement this - * by removing the mapping and re-establishing it. - */ - if (device) - vgic_its_free_device(kvm, device); - - /* - * The spec does not say whether unmapping a not-mapped device - * is an error, so we are done in any case. - */ - if (!valid) - return 0; - - device = vgic_its_alloc_device(its, device_id, itt_addr, - num_eventid_bits); - - return PTR_ERR_OR_ZERO(device); -} - -/* - * The MAPC command maps collection IDs to redistributors. - * Must be called with the its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u16 coll_id; - u32 target_addr; - struct its_collection *collection; - bool valid; - - valid = its_cmd_get_validbit(its_cmd); - coll_id = its_cmd_get_collection(its_cmd); - target_addr = its_cmd_get_target_addr(its_cmd); - - if (target_addr >= atomic_read(&kvm->online_vcpus)) - return E_ITS_MAPC_PROCNUM_OOR; - - if (!valid) { - vgic_its_free_collection(its, coll_id); - } else { - collection = find_collection(its, coll_id); - - if (!collection) { - int ret; - - ret = vgic_its_alloc_collection(its, &collection, - coll_id); - if (ret) - return ret; - collection->target_addr = target_addr; - } else { - collection->target_addr = target_addr; - update_affinity_collection(kvm, its, collection); - } - } - - return 0; -} - -/* - * The CLEAR command removes the pending state for a particular LPI. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - struct its_ite *ite; - - - ite = find_ite(its, device_id, event_id); - if (!ite) - return E_ITS_CLEAR_UNMAPPED_INTERRUPT; - - ite->irq->pending_latch = false; - - if (ite->irq->hw) - return irq_set_irqchip_state(ite->irq->host_irq, - IRQCHIP_STATE_PENDING, false); - - return 0; -} - -/* - * The INV command syncs the configuration bits from the memory table. - * Must be called with the its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - struct its_ite *ite; - - - ite = find_ite(its, device_id, event_id); - if (!ite) - return E_ITS_INV_UNMAPPED_INTERRUPT; - - return update_lpi_config(kvm, ite->irq, NULL, true); -} - -/* - * The INVALL command requests flushing of all IRQ data in this collection. - * Find the VCPU mapped to that collection, then iterate over the VM's list - * of mapped LPIs and update the configuration for each IRQ which targets - * the specified vcpu. The configuration will be read from the in-memory - * configuration table. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 coll_id = its_cmd_get_collection(its_cmd); - struct its_collection *collection; - struct kvm_vcpu *vcpu; - struct vgic_irq *irq; - u32 *intids; - int irq_count, i; - - collection = find_collection(its, coll_id); - if (!its_is_collection_mapped(collection)) - return E_ITS_INVALL_UNMAPPED_COLLECTION; - - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); - if (irq_count < 0) - return irq_count; - - for (i = 0; i < irq_count; i++) { - irq = vgic_get_irq(kvm, NULL, intids[i]); - if (!irq) - continue; - update_lpi_config(kvm, irq, vcpu, false); - vgic_put_irq(kvm, irq); - } - - kfree(intids); - - if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) - its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); - - return 0; -} - -/* - * The MOVALL command moves the pending state of all IRQs targeting one - * redistributor to another. We don't hold the pending state in the VCPUs, - * but in the IRQs instead, so there is really not much to do for us here. 
- * However the spec says that no IRQ must target the old redistributor - * afterwards, so we make sure that no LPI is using the associated target_vcpu. - * This command affects all LPIs in the system that target that redistributor. - */ -static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 target1_addr = its_cmd_get_target_addr(its_cmd); - u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); - struct kvm_vcpu *vcpu1, *vcpu2; - struct vgic_irq *irq; - u32 *intids; - int irq_count, i; - - if (target1_addr >= atomic_read(&kvm->online_vcpus) || - target2_addr >= atomic_read(&kvm->online_vcpus)) - return E_ITS_MOVALL_PROCNUM_OOR; - - if (target1_addr == target2_addr) - return 0; - - vcpu1 = kvm_get_vcpu(kvm, target1_addr); - vcpu2 = kvm_get_vcpu(kvm, target2_addr); - - irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids); - if (irq_count < 0) - return irq_count; - - for (i = 0; i < irq_count; i++) { - irq = vgic_get_irq(kvm, NULL, intids[i]); - - update_affinity(irq, vcpu2); - - vgic_put_irq(kvm, irq); - } - - kfree(intids); - return 0; -} - -/* - * The INT command injects the LPI associated with that DevID/EvID pair. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 msi_data = its_cmd_get_id(its_cmd); - u64 msi_devid = its_cmd_get_deviceid(its_cmd); - - return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); -} - -/* - * This function is called with the its_cmd lock held, but the ITS data - * structure lock dropped. 
- */ -static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - int ret = -ENODEV; - - mutex_lock(&its->its_lock); - switch (its_cmd_get_command(its_cmd)) { - case GITS_CMD_MAPD: - ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); - break; - case GITS_CMD_MAPC: - ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); - break; - case GITS_CMD_MAPI: - ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); - break; - case GITS_CMD_MAPTI: - ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); - break; - case GITS_CMD_MOVI: - ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); - break; - case GITS_CMD_DISCARD: - ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd); - break; - case GITS_CMD_CLEAR: - ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd); - break; - case GITS_CMD_MOVALL: - ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); - break; - case GITS_CMD_INT: - ret = vgic_its_cmd_handle_int(kvm, its, its_cmd); - break; - case GITS_CMD_INV: - ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); - break; - case GITS_CMD_INVALL: - ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd); - break; - case GITS_CMD_SYNC: - /* we ignore this command: we are in sync all of the time */ - ret = 0; - break; - } - mutex_unlock(&its->its_lock); - - return ret; -} - -static u64 vgic_sanitise_its_baser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK, - GITS_BASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK, - GITS_BASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK, - GITS_BASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - /* We support only one (ITS) page size: 64K */ - reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; - - return reg; -} - -static u64 vgic_sanitise_its_cbaser(u64 reg) -{ - reg = vgic_sanitise_field(reg, 
GITS_CBASER_SHAREABILITY_MASK, - GITS_CBASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK, - GITS_CBASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK, - GITS_CBASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - /* Sanitise the physical address to be 64k aligned. */ - reg &= ~GENMASK_ULL(15, 12); - - return reg; -} - -static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return extract_bytes(its->cbaser, addr & 7, len); -} - -static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - /* When GITS_CTLR.Enable is 1, this register is RO. */ - if (its->enabled) - return; - - mutex_lock(&its->cmd_lock); - its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val); - its->cbaser = vgic_sanitise_its_cbaser(its->cbaser); - its->creadr = 0; - /* - * CWRITER is architecturally UNKNOWN on reset, but we need to reset - * it to CREADR to make sure we start with an empty command buffer. - */ - its->cwriter = its->creadr; - mutex_unlock(&its->cmd_lock); -} - -#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12) -#define ITS_CMD_SIZE 32 -#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) - -/* Must be called with the cmd_lock held. */ -static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) -{ - gpa_t cbaser; - u64 cmd_buf[4]; - - /* Commands are only processed when the ITS is enabled. 
*/ - if (!its->enabled) - return; - - cbaser = GITS_CBASER_ADDRESS(its->cbaser); - - while (its->cwriter != its->creadr) { - int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, - cmd_buf, ITS_CMD_SIZE); - /* - * If kvm_read_guest() fails, this could be due to the guest - * programming a bogus value in CBASER or something else going - * wrong from which we cannot easily recover. - * According to section 6.3.2 in the GICv3 spec we can just - * ignore that command then. - */ - if (!ret) - vgic_its_handle_command(kvm, its, cmd_buf); - - its->creadr += ITS_CMD_SIZE; - if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) - its->creadr = 0; - } -} - -/* - * By writing to CWRITER the guest announces new commands to be processed. - * To avoid any races in the first place, we take the its_cmd lock, which - * protects our ring buffer variables, so that there is only one user - * per ITS handling commands at a given time. - */ -static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u64 reg; - - if (!its) - return; - - mutex_lock(&its->cmd_lock); - - reg = update_64bit_reg(its->cwriter, addr & 7, len, val); - reg = ITS_CMD_OFFSET(reg); - if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { - mutex_unlock(&its->cmd_lock); - return; - } - its->cwriter = reg; - - vgic_its_process_commands(kvm, its); - - mutex_unlock(&its->cmd_lock); -} - -static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return extract_bytes(its->cwriter, addr & 0x7, len); -} - -static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return extract_bytes(its->creadr, addr & 0x7, len); -} - -static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 cmd_offset; - int ret = 0; - - 
mutex_lock(&its->cmd_lock); - - if (its->enabled) { - ret = -EBUSY; - goto out; - } - - cmd_offset = ITS_CMD_OFFSET(val); - if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { - ret = -EINVAL; - goto out; - } - - its->creadr = cmd_offset; -out: - mutex_unlock(&its->cmd_lock); - return ret; -} - -#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7) -static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u64 reg; - - switch (BASER_INDEX(addr)) { - case 0: - reg = its->baser_device_table; - break; - case 1: - reg = its->baser_coll_table; - break; - default: - reg = 0; - break; - } - - return extract_bytes(reg, addr & 7, len); -} - -#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) -static void vgic_mmio_write_its_baser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 entry_size, table_type; - u64 reg, *regptr, clearbits = 0; - - /* When GITS_CTLR.Enable is 1, we ignore write accesses. 
*/ - if (its->enabled) - return; - - switch (BASER_INDEX(addr)) { - case 0: - regptr = &its->baser_device_table; - entry_size = abi->dte_esz; - table_type = GITS_BASER_TYPE_DEVICE; - break; - case 1: - regptr = &its->baser_coll_table; - entry_size = abi->cte_esz; - table_type = GITS_BASER_TYPE_COLLECTION; - clearbits = GITS_BASER_INDIRECT; - break; - default: - return; - } - - reg = update_64bit_reg(*regptr, addr & 7, len, val); - reg &= ~GITS_BASER_RO_MASK; - reg &= ~clearbits; - - reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; - reg |= table_type << GITS_BASER_TYPE_SHIFT; - reg = vgic_sanitise_its_baser(reg); - - *regptr = reg; - - if (!(reg & GITS_BASER_VALID)) { - /* Take the its_lock to prevent a race with a save/restore */ - mutex_lock(&its->its_lock); - switch (table_type) { - case GITS_BASER_TYPE_DEVICE: - vgic_its_free_device_list(kvm, its); - break; - case GITS_BASER_TYPE_COLLECTION: - vgic_its_free_collection_list(kvm, its); - break; - } - mutex_unlock(&its->its_lock); - } -} - -static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u32 reg = 0; - - mutex_lock(&its->cmd_lock); - if (its->creadr == its->cwriter) - reg |= GITS_CTLR_QUIESCENT; - if (its->enabled) - reg |= GITS_CTLR_ENABLE; - mutex_unlock(&its->cmd_lock); - - return reg; -} - -static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - mutex_lock(&its->cmd_lock); - - /* - * It is UNPREDICTABLE to enable the ITS if any of the CBASER or - * device/collection BASER are invalid - */ - if (!its->enabled && (val & GITS_CTLR_ENABLE) && - (!(its->baser_device_table & GITS_BASER_VALID) || - !(its->baser_coll_table & GITS_BASER_VALID) || - !(its->cbaser & GITS_CBASER_VALID))) - goto out; - - its->enabled = !!(val & GITS_CTLR_ENABLE); - - /* - * Try to process any pending commands. 
This function bails out early - * if the ITS is disabled or no commands have been queued. - */ - vgic_its_process_commands(kvm, its); - -out: - mutex_unlock(&its->cmd_lock); -} - -#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ -{ \ - .reg_offset = off, \ - .len = length, \ - .access_flags = acc, \ - .its_read = rd, \ - .its_write = wr, \ -} - -#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\ -{ \ - .reg_offset = off, \ - .len = length, \ - .access_flags = acc, \ - .its_read = rd, \ - .its_write = wr, \ - .uaccess_its_write = uwr, \ -} - -static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, unsigned long val) -{ - /* Ignore */ -} - -static struct vgic_register_region its_registers[] = { - REGISTER_ITS_DESC(GITS_CTLR, - vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, - VGIC_ACCESS_32bit), - REGISTER_ITS_DESC_UACCESS(GITS_IIDR, - vgic_mmio_read_its_iidr, its_mmio_write_wi, - vgic_mmio_uaccess_write_its_iidr, 4, - VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_TYPER, - vgic_mmio_read_its_typer, its_mmio_write_wi, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_CBASER, - vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_CWRITER, - vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC_UACCESS(GITS_CREADR, - vgic_mmio_read_its_creadr, its_mmio_write_wi, - vgic_mmio_uaccess_write_its_creadr, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_BASER, - vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_IDREGS_BASE, - vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30, - VGIC_ACCESS_32bit), -}; - -/* This is called on setting the LPI enable bit in the redistributor. 
*/ -void vgic_enable_lpis(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)) - its_sync_lpi_pending_table(vcpu); -} - -static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its, - u64 addr) -{ - struct vgic_io_device *iodev = &its->iodev; - int ret; - - mutex_lock(&kvm->slots_lock); - if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { - ret = -EBUSY; - goto out; - } - - its->vgic_its_base = addr; - iodev->regions = its_registers; - iodev->nr_regions = ARRAY_SIZE(its_registers); - kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops); - - iodev->base_addr = its->vgic_its_base; - iodev->iodev_type = IODEV_ITS; - iodev->its = its; - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr, - KVM_VGIC_V3_ITS_SIZE, &iodev->dev); -out: - mutex_unlock(&kvm->slots_lock); - - return ret; -} - -#define INITIAL_BASER_VALUE \ - (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ - GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ - GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \ - GITS_BASER_PAGE_SIZE_64K) - -#define INITIAL_PROPBASER_VALUE \ - (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \ - GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \ - GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)) - -static int vgic_its_create(struct kvm_device *dev, u32 type) -{ - struct vgic_its *its; - - if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) - return -ENODEV; - - its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); - if (!its) - return -ENOMEM; - - if (vgic_initialized(dev->kvm)) { - int ret = vgic_v4_init(dev->kvm); - if (ret < 0) { - kfree(its); - return ret; - } - } - - mutex_init(&its->its_lock); - mutex_init(&its->cmd_lock); - - its->vgic_its_base = VGIC_ADDR_UNDEF; - - INIT_LIST_HEAD(&its->device_list); - INIT_LIST_HEAD(&its->collection_list); - - dev->kvm->arch.vgic.msis_require_devid = true; - dev->kvm->arch.vgic.has_its = true; - its->enabled = false; - its->dev = dev; - - 
its->baser_device_table = INITIAL_BASER_VALUE | - ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); - its->baser_coll_table = INITIAL_BASER_VALUE | - ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT); - dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; - - dev->private = its; - - return vgic_its_set_abi(its, NR_ITS_ABIS - 1); -} - -static void vgic_its_destroy(struct kvm_device *kvm_dev) -{ - struct kvm *kvm = kvm_dev->kvm; - struct vgic_its *its = kvm_dev->private; - - mutex_lock(&its->its_lock); - - vgic_its_free_device_list(kvm, its); - vgic_its_free_collection_list(kvm, its); - - mutex_unlock(&its->its_lock); - kfree(its); -} - -int vgic_its_has_attr_regs(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - const struct vgic_register_region *region; - gpa_t offset = attr->attr; - int align; - - align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 0x3 : 0x7; - - if (offset & align) - return -EINVAL; - - region = vgic_find_mmio_region(its_registers, - ARRAY_SIZE(its_registers), - offset); - if (!region) - return -ENXIO; - - return 0; -} - -int vgic_its_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u64 *reg, bool is_write) -{ - const struct vgic_register_region *region; - struct vgic_its *its; - gpa_t addr, offset; - unsigned int len; - int align, ret = 0; - - its = dev->private; - offset = attr->attr; - - /* - * Although the spec supports upper/lower 32-bit accesses to - * 64-bit ITS registers, the userspace ABI requires 64-bit - * accesses to all 64-bit wide registers. 
We therefore only - * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID - * registers - */ - if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4)) - align = 0x3; - else - align = 0x7; - - if (offset & align) - return -EINVAL; - - mutex_lock(&dev->kvm->lock); - - if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { - ret = -ENXIO; - goto out; - } - - region = vgic_find_mmio_region(its_registers, - ARRAY_SIZE(its_registers), - offset); - if (!region) { - ret = -ENXIO; - goto out; - } - - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - - addr = its->vgic_its_base + offset; - - len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4; - - if (is_write) { - if (region->uaccess_its_write) - ret = region->uaccess_its_write(dev->kvm, its, addr, - len, *reg); - else - region->its_write(dev->kvm, its, addr, len, *reg); - } else { - *reg = region->its_read(dev->kvm, its, addr, len); - } - unlock_all_vcpus(dev->kvm); -out: - mutex_unlock(&dev->kvm->lock); - return ret; -} - -static u32 compute_next_devid_offset(struct list_head *h, - struct its_device *dev) -{ - struct its_device *next; - u32 next_offset; - - if (list_is_last(&dev->dev_list, h)) - return 0; - next = list_next_entry(dev, dev_list); - next_offset = next->device_id - dev->device_id; - - return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET); -} - -static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite) -{ - struct its_ite *next; - u32 next_offset; - - if (list_is_last(&ite->ite_list, h)) - return 0; - next = list_next_entry(ite, ite_list); - next_offset = next->event_id - ite->event_id; - - return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET); -} - -/** - * entry_fn_t - Callback called on a table entry restore path - * @its: its handle - * @id: id of the entry - * @entry: pointer to the entry - * @opaque: pointer to an opaque data - * - * Return: < 0 on error, 0 if last element was identified, id offset to next - * element otherwise - */ -typedef int 
(*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, - void *opaque); - -/** - * scan_its_table - Scan a contiguous table in guest RAM and applies a function - * to each entry - * - * @its: its handle - * @base: base gpa of the table - * @size: size of the table in bytes - * @esz: entry size in bytes - * @start_id: the ID of the first entry in the table - * (non zero for 2d level tables) - * @fn: function to apply on each entry - * - * Return: < 0 on error, 0 if last element was identified, 1 otherwise - * (the last element may not be found on second level tables) - */ -static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, - int start_id, entry_fn_t fn, void *opaque) -{ - struct kvm *kvm = its->dev->kvm; - unsigned long len = size; - int id = start_id; - gpa_t gpa = base; - char entry[ESZ_MAX]; - int ret; - - memset(entry, 0, esz); - - while (len > 0) { - int next_offset; - size_t byte_offset; - - ret = kvm_read_guest_lock(kvm, gpa, entry, esz); - if (ret) - return ret; - - next_offset = fn(its, id, entry, opaque); - if (next_offset <= 0) - return next_offset; - - byte_offset = next_offset * esz; - id += next_offset; - gpa += byte_offset; - len -= byte_offset; - } - return 1; -} - -/** - * vgic_its_save_ite - Save an interrupt translation entry at @gpa - */ -static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, - struct its_ite *ite, gpa_t gpa, int ite_esz) -{ - struct kvm *kvm = its->dev->kvm; - u32 next_offset; - u64 val; - - next_offset = compute_next_eventid_offset(&dev->itt_head, ite); - val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) | - ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | - ite->collection->collection_id; - val = cpu_to_le64(val); - return kvm_write_guest(kvm, gpa, &val, ite_esz); -} - -/** - * vgic_its_restore_ite - restore an interrupt translation entry - * @event_id: id used for indexing - * @ptr: pointer to the ITE entry - * @opaque: pointer to the its_device - */ -static int 
vgic_its_restore_ite(struct vgic_its *its, u32 event_id, - void *ptr, void *opaque) -{ - struct its_device *dev = (struct its_device *)opaque; - struct its_collection *collection; - struct kvm *kvm = its->dev->kvm; - struct kvm_vcpu *vcpu = NULL; - u64 val; - u64 *p = (u64 *)ptr; - struct vgic_irq *irq; - u32 coll_id, lpi_id; - struct its_ite *ite; - u32 offset; - - val = *p; - - val = le64_to_cpu(val); - - coll_id = val & KVM_ITS_ITE_ICID_MASK; - lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT; - - if (!lpi_id) - return 1; /* invalid entry, no choice but to scan next entry */ - - if (lpi_id < VGIC_MIN_LPI) - return -EINVAL; - - offset = val >> KVM_ITS_ITE_NEXT_SHIFT; - if (event_id + offset >= BIT_ULL(dev->num_eventid_bits)) - return -EINVAL; - - collection = find_collection(its, coll_id); - if (!collection) - return -EINVAL; - - ite = vgic_its_alloc_ite(dev, collection, event_id); - if (IS_ERR(ite)) - return PTR_ERR(ite); - - if (its_is_collection_mapped(collection)) - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - irq = vgic_add_lpi(kvm, lpi_id, vcpu); - if (IS_ERR(irq)) - return PTR_ERR(irq); - ite->irq = irq; - - return offset; -} - -static int vgic_its_ite_cmp(void *priv, struct list_head *a, - struct list_head *b) -{ - struct its_ite *itea = container_of(a, struct its_ite, ite_list); - struct its_ite *iteb = container_of(b, struct its_ite, ite_list); - - if (itea->event_id < iteb->event_id) - return -1; - else - return 1; -} - -static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - gpa_t base = device->itt_addr; - struct its_ite *ite; - int ret; - int ite_esz = abi->ite_esz; - - list_sort(NULL, &device->itt_head, vgic_its_ite_cmp); - - list_for_each_entry(ite, &device->itt_head, ite_list) { - gpa_t gpa = base + ite->event_id * ite_esz; - - /* - * If an LPI carries the HW bit, this means that this - * interrupt is controlled by GICv4, and we do 
not - * have direct access to that state. Let's simply fail - * the save operation... - */ - if (ite->irq->hw) - return -EACCES; - - ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz); - if (ret) - return ret; - } - return 0; -} - -/** - * vgic_its_restore_itt - restore the ITT of a device - * - * @its: its handle - * @dev: device handle - * - * Return 0 on success, < 0 on error - */ -static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - gpa_t base = dev->itt_addr; - int ret; - int ite_esz = abi->ite_esz; - size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz; - - ret = scan_its_table(its, base, max_size, ite_esz, 0, - vgic_its_restore_ite, dev); - - /* scan_its_table returns +1 if all ITEs are invalid */ - if (ret > 0) - ret = 0; - - return ret; -} - -/** - * vgic_its_save_dte - Save a device table entry at a given GPA - * - * @its: ITS handle - * @dev: ITS device - * @ptr: GPA - */ -static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, - gpa_t ptr, int dte_esz) -{ - struct kvm *kvm = its->dev->kvm; - u64 val, itt_addr_field; - u32 next_offset; - - itt_addr_field = dev->itt_addr >> 8; - next_offset = compute_next_devid_offset(&its->device_list, dev); - val = (1ULL << KVM_ITS_DTE_VALID_SHIFT | - ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) | - (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | - (dev->num_eventid_bits - 1)); - val = cpu_to_le64(val); - return kvm_write_guest(kvm, ptr, &val, dte_esz); -} - -/** - * vgic_its_restore_dte - restore a device table entry - * - * @its: its handle - * @id: device id the DTE corresponds to - * @ptr: kernel VA where the 8 byte DTE is located - * @opaque: unused - * - * Return: < 0 on error, 0 if the dte is the last one, id offset to the - * next dte otherwise - */ -static int vgic_its_restore_dte(struct vgic_its *its, u32 id, - void *ptr, void *opaque) -{ - struct its_device *dev; - gpa_t itt_addr; - u8 
num_eventid_bits; - u64 entry = *(u64 *)ptr; - bool valid; - u32 offset; - int ret; - - entry = le64_to_cpu(entry); - - valid = entry >> KVM_ITS_DTE_VALID_SHIFT; - num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1; - itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK) - >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8; - - if (!valid) - return 1; - - /* dte entry is valid */ - offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT; - - dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits); - if (IS_ERR(dev)) - return PTR_ERR(dev); - - ret = vgic_its_restore_itt(its, dev); - if (ret) { - vgic_its_free_device(its->dev->kvm, dev); - return ret; - } - - return offset; -} - -static int vgic_its_device_cmp(void *priv, struct list_head *a, - struct list_head *b) -{ - struct its_device *deva = container_of(a, struct its_device, dev_list); - struct its_device *devb = container_of(b, struct its_device, dev_list); - - if (deva->device_id < devb->device_id) - return -1; - else - return 1; -} - -/** - * vgic_its_save_device_tables - Save the device table and all ITT - * into guest RAM - * - * L1/L2 handling is hidden by vgic_its_check_id() helper which directly - * returns the GPA of the device entry - */ -static int vgic_its_save_device_tables(struct vgic_its *its) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 baser = its->baser_device_table; - struct its_device *dev; - int dte_esz = abi->dte_esz; - - if (!(baser & GITS_BASER_VALID)) - return 0; - - list_sort(NULL, &its->device_list, vgic_its_device_cmp); - - list_for_each_entry(dev, &its->device_list, dev_list) { - int ret; - gpa_t eaddr; - - if (!vgic_its_check_id(its, baser, - dev->device_id, &eaddr)) - return -EINVAL; - - ret = vgic_its_save_itt(its, dev); - if (ret) - return ret; - - ret = vgic_its_save_dte(its, dev, eaddr, dte_esz); - if (ret) - return ret; - } - return 0; -} - -/** - * handle_l1_dte - callback used for L1 device table entries (2 stage case) - * - * @its: its handle - * @id: 
index of the entry in the L1 table - * @addr: kernel VA - * @opaque: unused - * - * L1 table entries are scanned by steps of 1 entry - * Return < 0 if error, 0 if last dte was found when scanning the L2 - * table, +1 otherwise (meaning next L1 entry must be scanned) - */ -static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, - void *opaque) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - int l2_start_id = id * (SZ_64K / abi->dte_esz); - u64 entry = *(u64 *)addr; - int dte_esz = abi->dte_esz; - gpa_t gpa; - int ret; - - entry = le64_to_cpu(entry); - - if (!(entry & KVM_ITS_L1E_VALID_MASK)) - return 1; - - gpa = entry & KVM_ITS_L1E_ADDR_MASK; - - ret = scan_its_table(its, gpa, SZ_64K, dte_esz, - l2_start_id, vgic_its_restore_dte, NULL); - - return ret; -} - -/** - * vgic_its_restore_device_tables - Restore the device table and all ITT - * from guest RAM to internal data structs - */ -static int vgic_its_restore_device_tables(struct vgic_its *its) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 baser = its->baser_device_table; - int l1_esz, ret; - int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; - gpa_t l1_gpa; - - if (!(baser & GITS_BASER_VALID)) - return 0; - - l1_gpa = GITS_BASER_ADDR_48_to_52(baser); - - if (baser & GITS_BASER_INDIRECT) { - l1_esz = GITS_LVL1_ENTRY_SIZE; - ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, - handle_l1_dte, NULL); - } else { - l1_esz = abi->dte_esz; - ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, - vgic_its_restore_dte, NULL); - } - - /* scan_its_table returns +1 if all entries are invalid */ - if (ret > 0) - ret = 0; - - return ret; -} - -static int vgic_its_save_cte(struct vgic_its *its, - struct its_collection *collection, - gpa_t gpa, int esz) -{ - u64 val; - - val = (1ULL << KVM_ITS_CTE_VALID_SHIFT | - ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | - collection->collection_id); - val = cpu_to_le64(val); - return kvm_write_guest(its->dev->kvm, 
gpa, &val, esz); -} - -static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) -{ - struct its_collection *collection; - struct kvm *kvm = its->dev->kvm; - u32 target_addr, coll_id; - u64 val; - int ret; - - BUG_ON(esz > sizeof(val)); - ret = kvm_read_guest_lock(kvm, gpa, &val, esz); - if (ret) - return ret; - val = le64_to_cpu(val); - if (!(val & KVM_ITS_CTE_VALID_MASK)) - return 0; - - target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT); - coll_id = val & KVM_ITS_CTE_ICID_MASK; - - if (target_addr >= atomic_read(&kvm->online_vcpus)) - return -EINVAL; - - collection = find_collection(its, coll_id); - if (collection) - return -EEXIST; - ret = vgic_its_alloc_collection(its, &collection, coll_id); - if (ret) - return ret; - collection->target_addr = target_addr; - return 1; -} - -/** - * vgic_its_save_collection_table - Save the collection table into - * guest RAM - */ -static int vgic_its_save_collection_table(struct vgic_its *its) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 baser = its->baser_coll_table; - gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); - struct its_collection *collection; - u64 val; - size_t max_size, filled = 0; - int ret, cte_esz = abi->cte_esz; - - if (!(baser & GITS_BASER_VALID)) - return 0; - - max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; - - list_for_each_entry(collection, &its->collection_list, coll_list) { - ret = vgic_its_save_cte(its, collection, gpa, cte_esz); - if (ret) - return ret; - gpa += cte_esz; - filled += cte_esz; - } - - if (filled == max_size) - return 0; - - /* - * table is not fully filled, add a last dummy element - * with valid bit unset - */ - val = 0; - BUG_ON(cte_esz > sizeof(val)); - ret = kvm_write_guest(its->dev->kvm, gpa, &val, cte_esz); - return ret; -} - -/** - * vgic_its_restore_collection_table - reads the collection table - * in guest memory and restores the ITS internal state. Requires the - * BASER registers to be restored before. 
- */ -static int vgic_its_restore_collection_table(struct vgic_its *its) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 baser = its->baser_coll_table; - int cte_esz = abi->cte_esz; - size_t max_size, read = 0; - gpa_t gpa; - int ret; - - if (!(baser & GITS_BASER_VALID)) - return 0; - - gpa = GITS_BASER_ADDR_48_to_52(baser); - - max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; - - while (read < max_size) { - ret = vgic_its_restore_cte(its, gpa, cte_esz); - if (ret <= 0) - break; - gpa += cte_esz; - read += cte_esz; - } - - if (ret > 0) - return 0; - - return ret; -} - -/** - * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM - * according to v0 ABI - */ -static int vgic_its_save_tables_v0(struct vgic_its *its) -{ - int ret; - - ret = vgic_its_save_device_tables(its); - if (ret) - return ret; - - return vgic_its_save_collection_table(its); -} - -/** - * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM - * to internal data structs according to V0 ABI - * - */ -static int vgic_its_restore_tables_v0(struct vgic_its *its) -{ - int ret; - - ret = vgic_its_restore_collection_table(its); - if (ret) - return ret; - - return vgic_its_restore_device_tables(its); -} - -static int vgic_its_commit_v0(struct vgic_its *its) -{ - const struct vgic_its_abi *abi; - - abi = vgic_its_get_abi(its); - its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK; - its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK; - - its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5) - << GITS_BASER_ENTRY_SIZE_SHIFT); - - its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5) - << GITS_BASER_ENTRY_SIZE_SHIFT); - return 0; -} - -static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its) -{ - /* We need to keep the ABI specific field values */ - its->baser_coll_table &= ~GITS_BASER_VALID; - its->baser_device_table &= ~GITS_BASER_VALID; - its->cbaser = 0; - its->creadr = 0; - its->cwriter = 0; - its->enabled = 0; - vgic_its_free_device_list(kvm, 
its); - vgic_its_free_collection_list(kvm, its); -} - -static int vgic_its_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_ITS_ADDR_TYPE: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - case KVM_DEV_ARM_ITS_CTRL_RESET: - return 0; - case KVM_DEV_ARM_ITS_SAVE_TABLES: - return 0; - case KVM_DEV_ARM_ITS_RESTORE_TABLES: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: - return vgic_its_has_attr_regs(dev, attr); - } - return -ENXIO; -} - -static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) -{ - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - int ret = 0; - - if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ - return 0; - - mutex_lock(&kvm->lock); - mutex_lock(&its->its_lock); - - if (!lock_all_vcpus(kvm)) { - mutex_unlock(&its->its_lock); - mutex_unlock(&kvm->lock); - return -EBUSY; - } - - switch (attr) { - case KVM_DEV_ARM_ITS_CTRL_RESET: - vgic_its_reset(kvm, its); - break; - case KVM_DEV_ARM_ITS_SAVE_TABLES: - ret = abi->save_tables(its); - break; - case KVM_DEV_ARM_ITS_RESTORE_TABLES: - ret = abi->restore_tables(its); - break; - } - - unlock_all_vcpus(kvm); - mutex_unlock(&its->its_lock); - mutex_unlock(&kvm->lock); - return ret; -} - -static int vgic_its_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - struct vgic_its *its = dev->private; - int ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - unsigned long type = (unsigned long)attr->attr; - u64 addr; - - if (type != KVM_VGIC_ITS_ADDR_TYPE) - return -ENODEV; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, - addr, SZ_64K); - if (ret) - return ret; - - return 
vgic_register_its_iodev(dev->kvm, its, addr); - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: - return vgic_its_ctrl(dev->kvm, its, attr->attr); - case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_its_attr_regs_access(dev, attr, ®, true); - } - } - return -ENXIO; -} - -static int vgic_its_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - struct vgic_its *its = dev->private; - u64 addr = its->vgic_its_base; - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - unsigned long type = (unsigned long)attr->attr; - - if (type != KVM_VGIC_ITS_ADDR_TYPE) - return -ENODEV; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - } - case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - int ret; - - ret = vgic_its_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - default: - return -ENXIO; - } - - return 0; -} - -static struct kvm_device_ops kvm_arm_vgic_its_ops = { - .name = "kvm-arm-vgic-its", - .create = vgic_its_create, - .destroy = vgic_its_destroy, - .set_attr = vgic_its_set_attr, - .get_attr = vgic_its_get_attr, - .has_attr = vgic_its_has_attr, -}; - -int kvm_vgic_register_its_device(void) -{ - return kvm_register_device_ops(&kvm_arm_vgic_its_ops, - KVM_DEV_TYPE_ARM_VGIC_ITS); -} diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c deleted file mode 100644 index 114dce9f4bf5..000000000000 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ /dev/null @@ -1,749 +0,0 @@ -/* - * VGIC: KVM DEVICE API - * - * Copyright (C) 2015 ARM Ltd. 
- * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ -#include <linux/kvm_host.h> -#include <kvm/arm_vgic.h> -#include <linux/uaccess.h> -#include <asm/kvm_mmu.h> -#include <asm/cputype.h> -#include "vgic.h" - -/* common helpers */ - -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) -{ - if (addr & ~kvm_phys_mask(kvm)) - return -E2BIG; - - if (!IS_ALIGNED(addr, alignment)) - return -EINVAL; - - if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) - return -EEXIST; - - return 0; -} - -static int vgic_check_type(struct kvm *kvm, int type_needed) -{ - if (kvm->arch.vgic.vgic_model != type_needed) - return -ENODEV; - else - return 0; -} - -/** - * kvm_vgic_addr - set or get vgic VM base addresses - * @kvm: pointer to the vm struct - * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX - * @addr: pointer to address value - * @write: if true set the address in the VM address space, if false read the - * address - * - * Set or get the vgic base addresses for the distributor and the virtual CPU - * interface in the VM physical address space. These addresses are properties - * of the emulated core/SoC and therefore user space initially knows this - * information. - * Check them for sanity (alignment, double assignment). We can't check for - * overlapping regions in case of a virtual GICv3 here, since we don't know - * the number of VCPUs yet, so we defer this check to map_resources(). 
- */ -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) -{ - int r = 0; - struct vgic_dist *vgic = &kvm->arch.vgic; - phys_addr_t *addr_ptr, alignment; - u64 undef_value = VGIC_ADDR_UNDEF; - - mutex_lock(&kvm->lock); - switch (type) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); - addr_ptr = &vgic->vgic_dist_base; - alignment = SZ_4K; - break; - case KVM_VGIC_V2_ADDR_TYPE_CPU: - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); - addr_ptr = &vgic->vgic_cpu_base; - alignment = SZ_4K; - break; - case KVM_VGIC_V3_ADDR_TYPE_DIST: - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); - addr_ptr = &vgic->vgic_dist_base; - alignment = SZ_64K; - break; - case KVM_VGIC_V3_ADDR_TYPE_REDIST: { - struct vgic_redist_region *rdreg; - - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); - if (r) - break; - if (write) { - r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); - goto out; - } - rdreg = list_first_entry(&vgic->rd_regions, - struct vgic_redist_region, list); - if (!rdreg) - addr_ptr = &undef_value; - else - addr_ptr = &rdreg->base; - break; - } - case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: - { - struct vgic_redist_region *rdreg; - u8 index; - - r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); - if (r) - break; - - index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK; - - if (write) { - gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK; - u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK) - >> KVM_VGIC_V3_RDIST_COUNT_SHIFT; - u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK) - >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT; - - if (!count || flags) - r = -EINVAL; - else - r = vgic_v3_set_redist_base(kvm, index, - base, count); - goto out; - } - - rdreg = vgic_v3_rdist_region_from_index(kvm, index); - if (!rdreg) { - r = -ENOENT; - goto out; - } - - *addr = index; - *addr |= rdreg->base; - *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; - goto out; - } - default: - r = -ENODEV; - } - - if (r) - goto out; - - if 
(write) { - r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment); - if (!r) - *addr_ptr = *addr; - } else { - *addr = *addr_ptr; - } - -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int vgic_set_common_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int r; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, true); - return (r == -ENODEV) ? -ENXIO : r; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 val; - int ret = 0; - - if (get_user(val, uaddr)) - return -EFAULT; - - /* - * We require: - * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs - * - at most 1024 interrupts - * - a multiple of 32 interrupts - */ - if (val < (VGIC_NR_PRIVATE_IRQS + 32) || - val > VGIC_MAX_RESERVED || - (val & 31)) - return -EINVAL; - - mutex_lock(&dev->kvm->lock); - - if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) - ret = -EBUSY; - else - dev->kvm->arch.vgic.nr_spis = - val - VGIC_NR_PRIVATE_IRQS; - - mutex_unlock(&dev->kvm->lock); - - return ret; - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - mutex_lock(&dev->kvm->lock); - r = vgic_init(dev->kvm); - mutex_unlock(&dev->kvm->lock); - return r; - } - break; - } - } - - return -ENXIO; -} - -static int vgic_get_common_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int r = -ENXIO; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - r = kvm_vgic_addr(dev->kvm, type, &addr, false); - if (r) - return (r == -ENODEV) ? 
-ENXIO : r; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - - r = put_user(dev->kvm->arch.vgic.nr_spis + - VGIC_NR_PRIVATE_IRQS, uaddr); - break; - } - } - - return r; -} - -static int vgic_create(struct kvm_device *dev, u32 type) -{ - return kvm_vgic_create(dev->kvm, type); -} - -static void vgic_destroy(struct kvm_device *dev) -{ - kfree(dev); -} - -int kvm_register_vgic_device(unsigned long type) -{ - int ret = -ENODEV; - - switch (type) { - case KVM_DEV_TYPE_ARM_VGIC_V2: - ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, - KVM_DEV_TYPE_ARM_VGIC_V2); - break; - case KVM_DEV_TYPE_ARM_VGIC_V3: - ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, - KVM_DEV_TYPE_ARM_VGIC_V3); - - if (ret) - break; - ret = kvm_vgic_register_its_device(); - break; - } - - return ret; -} - -int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr) -{ - int cpuid; - - cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> - KVM_DEV_ARM_VGIC_CPUID_SHIFT; - - if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) - return -EINVAL; - - reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid); - reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - - return 0; -} - -/* unlocks vcpus from @vcpu_lock_idx and smaller */ -static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) -{ - struct kvm_vcpu *tmp_vcpu; - - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); - mutex_unlock(&tmp_vcpu->mutex); - } -} - -void unlock_all_vcpus(struct kvm *kvm) -{ - unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); -} - -/* Returns true if all vcpus were locked, false otherwise */ -bool lock_all_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *tmp_vcpu; - int c; - - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. 
By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run and fiddle with the vgic state while we - * access it. - */ - kvm_for_each_vcpu(c, tmp_vcpu, kvm) { - if (!mutex_trylock(&tmp_vcpu->mutex)) { - unlock_vcpus(kvm, c - 1); - return false; - } - } - - return true; -} - -/** - * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state - * - * @dev: kvm device handle - * @attr: kvm device attribute - * @reg: address the value is read or written - * @is_write: true if userspace is writing a register - */ -static int vgic_v2_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u32 *reg, bool is_write) -{ - struct vgic_reg_attr reg_attr; - gpa_t addr; - struct kvm_vcpu *vcpu; - int ret; - - ret = vgic_v2_parse_attr(dev, attr, ®_attr); - if (ret) - return ret; - - vcpu = reg_attr.vcpu; - addr = reg_attr.addr; - - mutex_lock(&dev->kvm->lock); - - ret = vgic_init(dev->kvm); - if (ret) - goto out; - - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg); - break; - default: - ret = -EINVAL; - break; - } - - unlock_all_vcpus(dev->kvm); -out: - mutex_unlock(&dev->kvm->lock); - return ret; -} - -static int vgic_v2_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v2_attr_regs_access(dev, attr, ®, true); - } - } - - return -ENXIO; -} - -static int vgic_v2_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = 
vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg = 0; - - ret = vgic_v2_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - } - - return -ENXIO; -} - -static int vgic_v2_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - case KVM_VGIC_V2_ADDR_TYPE_CPU: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - return vgic_v2_has_attr_regs(dev, attr); - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v2_ops = { - .name = "kvm-arm-vgic-v2", - .create = vgic_create, - .destroy = vgic_destroy, - .set_attr = vgic_v2_set_attr, - .get_attr = vgic_v2_get_attr, - .has_attr = vgic_v2_has_attr, -}; - -int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr) -{ - unsigned long vgic_mpidr, mpidr_reg; - - /* - * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group, - * attr might not hold MPIDR. Hence assume vcpu0. 
- */ - if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) { - vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >> - KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT; - - mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr); - reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg); - } else { - reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0); - } - - if (!reg_attr->vcpu) - return -EINVAL; - - reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - - return 0; -} - -/* - * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state - * - * @dev: kvm device handle - * @attr: kvm device attribute - * @reg: address the value is read or written - * @is_write: true if userspace is writing a register - */ -static int vgic_v3_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u64 *reg, bool is_write) -{ - struct vgic_reg_attr reg_attr; - gpa_t addr; - struct kvm_vcpu *vcpu; - int ret; - u32 tmp32; - - ret = vgic_v3_parse_attr(dev, attr, ®_attr); - if (ret) - return ret; - - vcpu = reg_attr.vcpu; - addr = reg_attr.addr; - - mutex_lock(&dev->kvm->lock); - - if (unlikely(!vgic_initialized(dev->kvm))) { - ret = -EBUSY; - goto out; - } - - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; - break; - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; - break; - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 regid; - - regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write, - regid, reg); - break; - } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - unsigned int info, intid; - - info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> - KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; - if (info == 
VGIC_LEVEL_INFO_LINE_LEVEL) { - intid = attr->attr & - KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; - ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, - intid, reg); - } else { - ret = -EINVAL; - } - break; - } - default: - ret = -EINVAL; - break; - } - - unlock_all_vcpus(dev->kvm); -out: - mutex_unlock(&dev->kvm->lock); - return ret; -} - -static int vgic_v3_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 tmp32; - u64 reg; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - int ret; - - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: - mutex_lock(&dev->kvm->lock); - - if (!lock_all_vcpus(dev->kvm)) { - mutex_unlock(&dev->kvm->lock); - return -EBUSY; - } - ret = vgic_v3_save_pending_tables(dev->kvm); - unlock_all_vcpus(dev->kvm); - mutex_unlock(&dev->kvm->lock); - return ret; - } - break; - } - } - return -ENXIO; -} - -static int vgic_v3_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 
__user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); - } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); - } - } - return -ENXIO; -} - -static int vgic_v3_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V3_ADDR_TYPE_DIST: - case KVM_VGIC_V3_ADDR_TYPE_REDIST: - case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: - return vgic_v3_has_attr_regs(dev, attr); - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> - KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) == - VGIC_LEVEL_INFO_LINE_LEVEL) - return 0; - break; - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v3_ops = { - .name = "kvm-arm-vgic-v3", - .create = vgic_create, - .destroy = vgic_destroy, - .set_attr = vgic_v3_set_attr, - .get_attr = vgic_v3_get_attr, - .has_attr = vgic_v3_has_attr, -}; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c deleted file mode 100644 index 738b65d2d0e7..000000000000 
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ /dev/null @@ -1,554 +0,0 @@ -/* - * VGICv2 MMIO handling functions - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include <linux/irqchip/arm-gic.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/nospec.h> - -#include <kvm/iodev.h> -#include <kvm/arm_vgic.h> - -#include "vgic.h" -#include "vgic-mmio.h" - -/* - * The Revision field in the IIDR have the following meanings: - * - * Revision 1: Report GICv2 interrupts as group 0 instead of group 1 - * Revision 2: Interrupt groups are guest-configurable and signaled using - * their configured groups. - */ - -static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; - u32 value; - - switch (addr & 0x0c) { - case GIC_DIST_CTRL: - value = vgic->enabled ? 
GICD_ENABLE : 0; - break; - case GIC_DIST_CTR: - value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; - value = (value >> 5) - 1; - value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; - break; - case GIC_DIST_IIDR: - value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | - (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | - (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); - break; - default: - return 0; - } - - return value; -} - -static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - bool was_enabled = dist->enabled; - - switch (addr & 0x0c) { - case GIC_DIST_CTRL: - dist->enabled = val & GICD_ENABLE; - if (!was_enabled && dist->enabled) - vgic_kick_vcpus(vcpu->kvm); - break; - case GIC_DIST_CTR: - case GIC_DIST_IIDR: - /* Nothing to do */ - return; - } -} - -static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - switch (addr & 0x0c) { - case GIC_DIST_IIDR: - if (val != vgic_mmio_read_v2_misc(vcpu, addr, len)) - return -EINVAL; - - /* - * If we observe a write to GICD_IIDR we know that userspace - * has been updated and has had a chance to cope with older - * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting - * interrupts as group 1, and therefore we now allow groups to - * be user writable. Doing this by default would break - * migration from old kernels to new kernels with legacy - * userspace. 
- */ - vcpu->kvm->arch.vgic.v2_groups_user_writable = true; - return 0; - } - - vgic_mmio_write_v2_misc(vcpu, addr, len, val); - return 0; -} - -static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - if (vcpu->kvm->arch.vgic.v2_groups_user_writable) - vgic_mmio_write_group(vcpu, addr, len, val); - - return 0; -} - -static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus); - int intid = val & 0xf; - int targets = (val >> 16) & 0xff; - int mode = (val >> 24) & 0x03; - int c; - struct kvm_vcpu *vcpu; - unsigned long flags; - - switch (mode) { - case 0x0: /* as specified by targets */ - break; - case 0x1: - targets = (1U << nr_vcpus) - 1; /* all, ... */ - targets &= ~(1U << source_vcpu->vcpu_id); /* but self */ - break; - case 0x2: /* this very vCPU only */ - targets = (1U << source_vcpu->vcpu_id); - break; - case 0x3: /* reserved */ - return; - } - - kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) { - struct vgic_irq *irq; - - if (!(targets & (1U << c))) - continue; - - irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); - - spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - irq->source |= 1U << source_vcpu->vcpu_id; - - vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags); - vgic_put_irq(source_vcpu->kvm, irq); - } -} - -static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - u64 val = 0; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - val |= (u64)irq->targets << (i * 8); - - vgic_put_irq(vcpu->kvm, irq); - } - - return val; -} - -static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - u8 cpu_mask = 
GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0); - int i; - unsigned long flags; - - /* GICD_ITARGETSR[0-7] are read-only */ - if (intid < VGIC_NR_PRIVATE_IRQS) - return; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); - int target; - - spin_lock_irqsave(&irq->irq_lock, flags); - - irq->targets = (val >> (i * 8)) & cpu_mask; - target = irq->targets ? __ffs(irq->targets) : 0; - irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); - - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = addr & 0x0f; - int i; - u64 val = 0; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - val |= (u64)irq->source << (i * 8); - - vgic_put_irq(vcpu->kvm, irq); - } - return val; -} - -static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = addr & 0x0f; - int i; - unsigned long flags; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock_irqsave(&irq->irq_lock, flags); - - irq->source &= ~((val >> (i * 8)) & 0xff); - if (!irq->source) - irq->pending_latch = false; - - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = addr & 0x0f; - int i; - unsigned long flags; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock_irqsave(&irq->irq_lock, flags); - - irq->source |= (val >> (i * 8)) & 0xff; - - if (irq->source) { - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { - spin_unlock_irqrestore(&irq->irq_lock, flags); - } - vgic_put_irq(vcpu->kvm, 
irq); - } -} - -#define GICC_ARCH_VERSION_V2 0x2 - -/* These are for userland accesses only, there is no guest-facing emulation. */ -static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_vmcr vmcr; - u32 val; - - vgic_get_vmcr(vcpu, &vmcr); - - switch (addr & 0xff) { - case GIC_CPU_CTRL: - val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT; - val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT; - val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT; - val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT; - val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT; - val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT; - - break; - case GIC_CPU_PRIMASK: - /* - * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than - * GICC_PMR.Priority, so we expose the upper five bits of - * priority mask to userspace using the lower bits in the - * unsigned long. - */ - val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >> - GICV_PMR_PRIORITY_SHIFT; - break; - case GIC_CPU_BINPOINT: - val = vmcr.bpr; - break; - case GIC_CPU_ALIAS_BINPOINT: - val = vmcr.abpr; - break; - case GIC_CPU_IDENT: - val = ((PRODUCT_ID_KVM << 20) | - (GICC_ARCH_VERSION_V2 << 16) | - IMPLEMENTER_ARM); - break; - default: - return 0; - } - - return val; -} - -static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_vmcr vmcr; - - vgic_get_vmcr(vcpu, &vmcr); - - switch (addr & 0xff) { - case GIC_CPU_CTRL: - vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0); - vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1); - vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl); - vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn); - vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR); - vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS); - - break; - case GIC_CPU_PRIMASK: - /* - * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than - * GICC_PMR.Priority, so we expose 
the upper five bits of - * priority mask to userspace using the lower bits in the - * unsigned long. - */ - vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) & - GICV_PMR_PRIORITY_MASK; - break; - case GIC_CPU_BINPOINT: - vmcr.bpr = val; - break; - case GIC_CPU_ALIAS_BINPOINT: - vmcr.abpr = val; - break; - } - - vgic_set_vmcr(vcpu, &vmcr); -} - -static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - int n; /* which APRn is this */ - - n = (addr >> 2) & 0x3; - - if (kvm_vgic_global_state.type == VGIC_V2) { - /* GICv2 hardware systems support max. 32 groups */ - if (n != 0) - return 0; - return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr; - } else { - struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - - if (n > vgic_v3_max_apr_idx(vcpu)) - return 0; - - n = array_index_nospec(n, 4); - - /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ - return vgicv3->vgic_ap1r[n]; - } -} - -static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - int n; /* which APRn is this */ - - n = (addr >> 2) & 0x3; - - if (kvm_vgic_global_state.type == VGIC_V2) { - /* GICv2 hardware systems support max. 
32 groups */ - if (n != 0) - return; - vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val; - } else { - struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - - if (n > vgic_v3_max_apr_idx(vcpu)) - return; - - n = array_index_nospec(n, 4); - - /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ - vgicv3->vgic_ap1r[n] = val; - } -} - -static const struct vgic_register_region vgic_v2_dist_registers[] = { - REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL, - vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, - NULL, vgic_mmio_uaccess_write_v2_misc, - 12, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP, - vgic_mmio_read_group, vgic_mmio_write_group, - NULL, vgic_mmio_uaccess_write_v2_group, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET, - vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR, - vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, - vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR, - vgic_mmio_read_pending, vgic_mmio_write_cpending, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, - vgic_mmio_read_active, vgic_mmio_write_sactive, - NULL, vgic_mmio_uaccess_write_sactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR, - vgic_mmio_read_active, vgic_mmio_write_cactive, - NULL, vgic_mmio_uaccess_write_cactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI, - vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, - 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET, - vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - 
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG, - vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT, - vgic_mmio_read_raz, vgic_mmio_write_sgir, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR, - vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET, - vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), -}; - -static const struct vgic_register_region vgic_v2_cpu_registers[] = { - REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO, - vgic_mmio_read_apr, vgic_mmio_write_apr, 16, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), -}; - -unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) -{ - dev->regions = vgic_v2_dist_registers; - dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); - - kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); - - return SZ_4K; -} - -int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) -{ - const struct vgic_register_region *region; - struct vgic_io_device iodev; - struct vgic_reg_attr reg_attr; - struct kvm_vcpu *vcpu; - gpa_t addr; - int ret; - - ret = vgic_v2_parse_attr(dev, attr, ®_attr); - if (ret) - return ret; - - vcpu = reg_attr.vcpu; - addr = reg_attr.addr; - - switch (attr->group) { - case 
KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - iodev.regions = vgic_v2_dist_registers; - iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); - iodev.base_addr = 0; - break; - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - iodev.regions = vgic_v2_cpu_registers; - iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); - iodev.base_addr = 0; - break; - default: - return -ENXIO; - } - - /* We only support aligned 32-bit accesses. */ - if (addr & 3) - return -ENXIO; - - region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); - if (!region) - return -ENXIO; - - return 0; -} - -int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device dev = { - .regions = vgic_v2_cpu_registers, - .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers), - .iodev_type = IODEV_CPUIF, - }; - - return vgic_uaccess(vcpu, &dev, is_write, offset, val); -} - -int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device dev = { - .regions = vgic_v2_dist_registers, - .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers), - .iodev_type = IODEV_DIST, - }; - - return vgic_uaccess(vcpu, &dev, is_write, offset, val); -} diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c deleted file mode 100644 index b3d1f0985117..000000000000 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ /dev/null @@ -1,1022 +0,0 @@ -/* - * VGICv3 MMIO handling functions - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ - -#include <linux/irqchip/arm-gic-v3.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <kvm/iodev.h> -#include <kvm/arm_vgic.h> - -#include <asm/kvm_emulate.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_mmu.h> - -#include "vgic.h" -#include "vgic-mmio.h" - -/* extract @num bytes at @offset bytes offset in data */ -unsigned long extract_bytes(u64 data, unsigned int offset, - unsigned int num) -{ - return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); -} - -/* allows updates of any half of a 64-bit register (or the whole thing) */ -u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, - unsigned long val) -{ - int lower = (offset & 4) * 8; - int upper = lower + 8 * len - 1; - - reg &= ~GENMASK_ULL(upper, lower); - val &= GENMASK_ULL(len * 8 - 1, 0); - - return reg | ((u64)val << lower); -} - -bool vgic_has_its(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) - return false; - - return dist->has_its; -} - -bool vgic_supports_direct_msis(struct kvm *kvm) -{ - return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm); -} - -/* - * The Revision field in the IIDR have the following meanings: - * - * Revision 2: Interrupt groups are guest-configurable and signaled using - * their configured groups. 
- */ - -static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; - u32 value = 0; - - switch (addr & 0x0c) { - case GICD_CTLR: - if (vgic->enabled) - value |= GICD_CTLR_ENABLE_SS_G1; - value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; - break; - case GICD_TYPER: - value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; - value = (value >> 5) - 1; - if (vgic_has_its(vcpu->kvm)) { - value |= (INTERRUPT_ID_BITS_ITS - 1) << 19; - value |= GICD_TYPER_LPIS; - } else { - value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; - } - break; - case GICD_IIDR: - value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | - (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | - (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); - break; - default: - return 0; - } - - return value; -} - -static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - bool was_enabled = dist->enabled; - - switch (addr & 0x0c) { - case GICD_CTLR: - dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; - - if (!was_enabled && dist->enabled) - vgic_kick_vcpus(vcpu->kvm); - break; - case GICD_TYPER: - case GICD_IIDR: - return; - } -} - -static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - switch (addr & 0x0c) { - case GICD_IIDR: - if (val != vgic_mmio_read_v3_misc(vcpu, addr, len)) - return -EINVAL; - } - - vgic_mmio_write_v3_misc(vcpu, addr, len, val); - return 0; -} - -static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); - unsigned long ret = 0; - - if (!irq) - return 0; - - /* The upper word is RAZ for us. 
*/ - if (!(addr & 4)) - ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); - - vgic_put_irq(vcpu->kvm, irq); - return ret; -} - -static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq; - unsigned long flags; - - /* The upper word is WI for us since we don't implement Aff3. */ - if (addr & 4) - return; - - irq = vgic_get_irq(vcpu->kvm, NULL, intid); - - if (!irq) - return; - - spin_lock_irqsave(&irq->irq_lock, flags); - - /* We only care about and preserve Aff0, Aff1 and Aff2. */ - irq->mpidr = val & GENMASK(23, 0); - irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); - - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); -} - -static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0; -} - - -static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - bool was_enabled = vgic_cpu->lpis_enabled; - - if (!vgic_has_its(vcpu->kvm)) - return; - - vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; - - if (!was_enabled && vgic_cpu->lpis_enabled) - vgic_enable_lpis(vcpu); -} - -static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_redist_region *rdreg = vgic_cpu->rdreg; - int target_vcpu_id = vcpu->vcpu_id; - gpa_t last_rdist_typer = rdreg->base + GICR_TYPER + - (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE; - u64 value; - - value = (u64)(mpidr & GENMASK(23, 0)) << 32; - value |= ((target_vcpu_id & 0xffff) << 8); - - if (addr == last_rdist_typer) - value |= 
GICR_TYPER_LAST; - if (vgic_has_its(vcpu->kvm)) - value |= GICR_TYPER_PLPIS; - - return extract_bytes(value, addr & 7, len); -} - -static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); -} - -static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - switch (addr & 0xffff) { - case GICD_PIDR2: - /* report a GICv3 compliant implementation */ - return 0x3b; - } - - return 0; -} - -static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* - * pending state of interrupt is latched in pending_latch variable. - * Userspace will save and restore pending state and line_level - * separately. - * Refer to Documentation/virtual/kvm/devices/arm-vgic-v3.txt - * for handling of ISPENDR and ICPENDR. - */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->pending_latch) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock_irqsave(&irq->irq_lock, flags); - if (test_bit(i, &val)) { - /* - * pending_latch is set irrespective of irq type - * (level or edge) to avoid dependency that VM should - * restore irq config before pending info. - */ - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { - irq->pending_latch = false; - spin_unlock_irqrestore(&irq->irq_lock, flags); - } - - vgic_put_irq(vcpu->kvm, irq); - } - - return 0; -} - -/* We want to avoid outer shareable. 
*/ -u64 vgic_sanitise_shareability(u64 field) -{ - switch (field) { - case GIC_BASER_OuterShareable: - return GIC_BASER_InnerShareable; - default: - return field; - } -} - -/* Avoid any inner non-cacheable mapping. */ -u64 vgic_sanitise_inner_cacheability(u64 field) -{ - switch (field) { - case GIC_BASER_CACHE_nCnB: - case GIC_BASER_CACHE_nC: - return GIC_BASER_CACHE_RaWb; - default: - return field; - } -} - -/* Non-cacheable or same-as-inner are OK. */ -u64 vgic_sanitise_outer_cacheability(u64 field) -{ - switch (field) { - case GIC_BASER_CACHE_SameAsInner: - case GIC_BASER_CACHE_nC: - return field; - default: - return GIC_BASER_CACHE_nC; - } -} - -u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, - u64 (*sanitise_fn)(u64)) -{ - u64 field = (reg & field_mask) >> field_shift; - - field = sanitise_fn(field) << field_shift; - return (reg & ~field_mask) | field; -} - -#define PROPBASER_RES0_MASK \ - (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5)) -#define PENDBASER_RES0_MASK \ - (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \ - GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0)) - -static u64 vgic_sanitise_pendbaser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK, - GICR_PENDBASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK, - GICR_PENDBASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK, - GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - reg &= ~PENDBASER_RES0_MASK; - - return reg; -} - -static u64 vgic_sanitise_propbaser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK, - GICR_PROPBASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK, - GICR_PROPBASER_INNER_CACHEABILITY_SHIFT, - 
vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK, - GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - reg &= ~PROPBASER_RES0_MASK; - return reg; -} - -static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return extract_bytes(dist->propbaser, addr & 7, len); -} - -static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u64 old_propbaser, propbaser; - - /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) - return; - - do { - old_propbaser = READ_ONCE(dist->propbaser); - propbaser = old_propbaser; - propbaser = update_64bit_reg(propbaser, addr & 4, len, val); - propbaser = vgic_sanitise_propbaser(propbaser); - } while (cmpxchg64(&dist->propbaser, old_propbaser, - propbaser) != old_propbaser); -} - -static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - return extract_bytes(vgic_cpu->pendbaser, addr & 7, len); -} - -static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u64 old_pendbaser, pendbaser; - - /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) - return; - - do { - old_pendbaser = READ_ONCE(vgic_cpu->pendbaser); - pendbaser = old_pendbaser; - pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); - pendbaser = vgic_sanitise_pendbaser(pendbaser); - } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser, - pendbaser) != old_pendbaser); -} - -/* - * The GICv3 per-IRQ registers are split to control PPIs and SGIs in 
the - * redistributors, while SPIs are covered by registers in the distributor - * block. Trying to set private IRQs in this block gets ignored. - * We take some special care here to fix the calculation of the register - * offset. - */ -#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = bpi, \ - .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ - .access_flags = acc, \ - .read = vgic_mmio_read_raz, \ - .write = vgic_mmio_write_wi, \ - }, { \ - .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ - .bits_per_irq = bpi, \ - .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - .uaccess_read = ur, \ - .uaccess_write = uw, \ - } - -static const struct vgic_register_region vgic_v3_dist_registers[] = { - REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR, - vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, - NULL, vgic_mmio_uaccess_write_v3_misc, - 16, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICD_STATUSR, - vgic_mmio_read_rao, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, - vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, - vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, - vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, - vgic_mmio_read_pending, vgic_mmio_write_spending, - vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, - vgic_mmio_read_pending, vgic_mmio_write_cpending, - vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, - vgic_mmio_read_active, 
vgic_mmio_write_sactive, - NULL, vgic_mmio_uaccess_write_sactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, - vgic_mmio_read_active, vgic_mmio_write_cactive, - NULL, vgic_mmio_uaccess_write_cactive, - 1, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, - vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, - 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR, - vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR, - vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR, - vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER, - vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICD_IDREGS, - vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, - VGIC_ACCESS_32bit), -}; - -static const struct vgic_register_region vgic_v3_rdbase_registers[] = { - REGISTER_DESC_WITH_LENGTH(GICR_CTLR, - vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_STATUSR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_IIDR, - vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_TYPER, - vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_WAKER, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, - vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, - 
vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, - vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, - VGIC_ACCESS_32bit), -}; - -static const struct vgic_register_region vgic_v3_sgibase_registers[] = { - REGISTER_DESC_WITH_LENGTH(GICR_IGROUPR0, - vgic_mmio_read_group, vgic_mmio_write_group, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ISENABLER0, - vgic_mmio_read_enable, vgic_mmio_write_senable, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0, - vgic_mmio_read_enable, vgic_mmio_write_cenable, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISPENDR0, - vgic_mmio_read_pending, vgic_mmio_write_spending, - vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICPENDR0, - vgic_mmio_read_pending, vgic_mmio_write_cpending, - vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISACTIVER0, - vgic_mmio_read_active, vgic_mmio_write_sactive, - NULL, vgic_mmio_uaccess_write_sactive, - 4, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICACTIVER0, - vgic_mmio_read_active, vgic_mmio_write_cactive, - NULL, vgic_mmio_uaccess_write_cactive, - 4, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0, - vgic_mmio_read_priority, vgic_mmio_write_priority, 32, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_LENGTH(GICR_ICFGR0, - vgic_mmio_read_config, vgic_mmio_write_config, 8, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_IGRPMODR0, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_NSACR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), -}; - -unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev) -{ - dev->regions = vgic_v3_dist_registers; - dev->nr_regions = 
ARRAY_SIZE(vgic_v3_dist_registers); - - kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); - - return SZ_64K; -} - -/** - * vgic_register_redist_iodev - register a single redist iodev - * @vcpu: The VCPU to which the redistributor belongs - * - * Register a KVM iodev for this VCPU's redistributor using the address - * provided. - * - * Return 0 on success, -ERRNO otherwise. - */ -int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - struct vgic_dist *vgic = &kvm->arch.vgic; - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; - struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev; - struct vgic_redist_region *rdreg; - gpa_t rd_base, sgi_base; - int ret; - - if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) - return 0; - - /* - * We may be creating VCPUs before having set the base address for the - * redistributor region, in which case we will come back to this - * function for all VCPUs when the base address is set. Just return - * without doing any work for now. 
- */ - rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions); - if (!rdreg) - return 0; - - if (!vgic_v3_check_base(kvm)) - return -EINVAL; - - vgic_cpu->rdreg = rdreg; - - rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE; - sgi_base = rd_base + SZ_64K; - - kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); - rd_dev->base_addr = rd_base; - rd_dev->iodev_type = IODEV_REDIST; - rd_dev->regions = vgic_v3_rdbase_registers; - rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers); - rd_dev->redist_vcpu = vcpu; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base, - SZ_64K, &rd_dev->dev); - mutex_unlock(&kvm->slots_lock); - - if (ret) - return ret; - - kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops); - sgi_dev->base_addr = sgi_base; - sgi_dev->iodev_type = IODEV_REDIST; - sgi_dev->regions = vgic_v3_sgibase_registers; - sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers); - sgi_dev->redist_vcpu = vcpu; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, sgi_base, - SZ_64K, &sgi_dev->dev); - if (ret) { - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &rd_dev->dev); - goto out; - } - - rdreg->free_index++; -out: - mutex_unlock(&kvm->slots_lock); - return ret; -} - -static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) -{ - struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; - struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev; - - kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev); - kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &sgi_dev->dev); -} - -static int vgic_register_all_redist_iodevs(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int c, ret = 0; - - kvm_for_each_vcpu(c, vcpu, kvm) { - ret = vgic_register_redist_iodev(vcpu); - if (ret) - break; - } - - if (ret) { - /* The current c failed, so we start with the previous one. 
*/ - mutex_lock(&kvm->slots_lock); - for (c--; c >= 0; c--) { - vcpu = kvm_get_vcpu(kvm, c); - vgic_unregister_redist_iodev(vcpu); - } - mutex_unlock(&kvm->slots_lock); - } - - return ret; -} - -/** - * vgic_v3_insert_redist_region - Insert a new redistributor region - * - * Performs various checks before inserting the rdist region in the list. - * Those tests depend on whether the size of the rdist region is known - * (ie. count != 0). The list is sorted by rdist region index. - * - * @kvm: kvm handle - * @index: redist region index - * @base: base of the new rdist region - * @count: number of redistributors the region is made of (0 in the old style - * single region, whose size is induced from the number of vcpus) - * - * Return 0 on success, < 0 otherwise - */ -static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index, - gpa_t base, uint32_t count) -{ - struct vgic_dist *d = &kvm->arch.vgic; - struct vgic_redist_region *rdreg; - struct list_head *rd_regions = &d->rd_regions; - size_t size = count * KVM_VGIC_V3_REDIST_SIZE; - int ret; - - /* single rdist region already set ?*/ - if (!count && !list_empty(rd_regions)) - return -EINVAL; - - /* cross the end of memory ? */ - if (base + size < base) - return -EINVAL; - - if (list_empty(rd_regions)) { - if (index != 0) - return -EINVAL; - } else { - rdreg = list_last_entry(rd_regions, - struct vgic_redist_region, list); - if (index != rdreg->index + 1) - return -EINVAL; - - /* Cannot add an explicitly sized regions after legacy region */ - if (!rdreg->count) - return -EINVAL; - } - - /* - * For legacy single-region redistributor regions (!count), - * check that the redistributor region does not overlap with the - * distributor's address space. - */ - if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && - vgic_dist_overlap(kvm, base, size)) - return -EINVAL; - - /* collision with any other rdist region? 
*/ - if (vgic_v3_rdist_overlap(kvm, base, size)) - return -EINVAL; - - rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL); - if (!rdreg) - return -ENOMEM; - - rdreg->base = VGIC_ADDR_UNDEF; - - ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K); - if (ret) - goto free; - - rdreg->base = base; - rdreg->count = count; - rdreg->free_index = 0; - rdreg->index = index; - - list_add_tail(&rdreg->list, rd_regions); - return 0; -free: - kfree(rdreg); - return ret; -} - -int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) -{ - int ret; - - ret = vgic_v3_insert_redist_region(kvm, index, addr, count); - if (ret) - return ret; - - /* - * Register iodevs for each existing VCPU. Adding more VCPUs - * afterwards will register the iodevs when needed. - */ - ret = vgic_register_all_redist_iodevs(kvm); - if (ret) - return ret; - - return 0; -} - -int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) -{ - const struct vgic_register_region *region; - struct vgic_io_device iodev; - struct vgic_reg_attr reg_attr; - struct kvm_vcpu *vcpu; - gpa_t addr; - int ret; - - ret = vgic_v3_parse_attr(dev, attr, ®_attr); - if (ret) - return ret; - - vcpu = reg_attr.vcpu; - addr = reg_attr.addr; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - iodev.regions = vgic_v3_dist_registers; - iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); - iodev.base_addr = 0; - break; - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{ - iodev.regions = vgic_v3_rdbase_registers; - iodev.nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers); - iodev.base_addr = 0; - break; - } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 reg, id; - - id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, ®); - } - default: - return -ENXIO; - } - - /* We only support aligned 32-bit accesses. 
*/ - if (addr & 3) - return -ENXIO; - - region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); - if (!region) - return -ENXIO; - - return 0; -} -/* - * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI - * generation register ICC_SGI1R_EL1) with a given VCPU. - * If the VCPU's MPIDR matches, return the level0 affinity, otherwise - * return -1. - */ -static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) -{ - unsigned long affinity; - int level0; - - /* - * Split the current VCPU's MPIDR into affinity level 0 and the - * rest as this is what we have to compare against. - */ - affinity = kvm_vcpu_get_mpidr_aff(vcpu); - level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); - affinity &= ~MPIDR_LEVEL_MASK; - - /* bail out if the upper three levels don't match */ - if (sgi_aff != affinity) - return -1; - - /* Is this VCPU's bit set in the mask ? */ - if (!(sgi_cpu_mask & BIT(level0))) - return -1; - - return level0; -} - -/* - * The ICC_SGI* registers encode the affinity differently from the MPIDR, - * so provide a wrapper to use the existing defines to isolate a certain - * affinity level. - */ -#define SGI_AFFINITY_LEVEL(reg, level) \ - ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ - >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) - -/** - * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs - * @vcpu: The VCPU requesting a SGI - * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU - * @allow_group1: Does the sysreg access allow generation of G1 SGIs - * - * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. - * This will trap in sys_regs.c and call this function. - * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the - * target processors as well as a bitmask of 16 Aff0 CPUs. - * If the interrupt routing mode bit is not set, we iterate over all VCPUs to - * check for matching ones. 
If this bit is set, we signal all, but not the - * calling VCPU. - */ -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *c_vcpu; - u16 target_cpus; - u64 mpidr; - int sgi, c; - int vcpu_id = vcpu->vcpu_id; - bool broadcast; - unsigned long flags; - - sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; - broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); - target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; - mpidr = SGI_AFFINITY_LEVEL(reg, 3); - mpidr |= SGI_AFFINITY_LEVEL(reg, 2); - mpidr |= SGI_AFFINITY_LEVEL(reg, 1); - - /* - * We iterate over all VCPUs to find the MPIDRs matching the request. - * If we have handled one CPU, we clear its bit to detect early - * if we are already finished. This avoids iterating through all - * VCPUs when most of the times we just signal a single VCPU. - */ - kvm_for_each_vcpu(c, c_vcpu, kvm) { - struct vgic_irq *irq; - - /* Exit early if we have dealt with all requested CPUs */ - if (!broadcast && target_cpus == 0) - break; - - /* Don't signal the calling VCPU */ - if (broadcast && c == vcpu_id) - continue; - - if (!broadcast) { - int level0; - - level0 = match_mpidr(mpidr, target_cpus, c_vcpu); - if (level0 == -1) - continue; - - /* remove this matching VCPU from the mask */ - target_cpus &= ~BIT(level0); - } - - irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); - - spin_lock_irqsave(&irq->irq_lock, flags); - - /* - * An access targetting Group0 SGIs can only generate - * those, while an access targetting Group1 SGIs can - * generate interrupts of either group. 
- */ - if (!irq->group || allow_group1) { - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { - spin_unlock_irqrestore(&irq->irq_lock, flags); - } - - vgic_put_irq(vcpu->kvm, irq); - } -} - -int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device dev = { - .regions = vgic_v3_dist_registers, - .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers), - }; - - return vgic_uaccess(vcpu, &dev, is_write, offset, val); -} - -int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device rd_dev = { - .regions = vgic_v3_rdbase_registers, - .nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers), - }; - - struct vgic_io_device sgi_dev = { - .regions = vgic_v3_sgibase_registers, - .nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers), - }; - - /* SGI_base is the next 64K frame after RD_base */ - if (offset >= SZ_64K) - return vgic_uaccess(vcpu, &sgi_dev, is_write, offset - SZ_64K, - val); - else - return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); -} - -int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val) -{ - if (intid % 32) - return -EINVAL; - - if (is_write) - vgic_write_irq_line_level_info(vcpu, intid, *val); - else - *val = vgic_read_irq_line_level_info(vcpu, intid); - - return 0; -} diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c deleted file mode 100644 index ceeda7e04a4d..000000000000 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ /dev/null @@ -1,893 +0,0 @@ -/* - * VGIC MMIO handling functions - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include <linux/bitops.h> -#include <linux/bsearch.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <kvm/iodev.h> -#include <kvm/arm_arch_timer.h> -#include <kvm/arm_vgic.h> - -#include "vgic.h" -#include "vgic-mmio.h" - -unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return 0; -} - -unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return -1UL; -} - -void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val) -{ - /* Ignore */ -} - -int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val) -{ - /* Ignore */ - return 0; -} - -unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->group) - value |= BIT(i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock_irqsave(&irq->irq_lock, flags); - irq->group = !!(val & BIT(i)); - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - - vgic_put_irq(vcpu->kvm, irq); - } -} - -/* - * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value - * of the enabled bit, so there is only one function for both 
here. - */ -unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->enabled) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock_irqsave(&irq->irq_lock, flags); - irq->enabled = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - - vgic_put_irq(vcpu->kvm, irq); - } -} - -void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock_irqsave(&irq->irq_lock, flags); - - irq->enabled = false; - - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - unsigned long flags; - - spin_lock_irqsave(&irq->irq_lock, flags); - if (irq_is_pending(irq)) - value |= (1U << i); - spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -/* - * This function will return the VCPU that performed the MMIO access and - * trapped from within the VM, and 
will return NULL if this is a userspace - * access. - * - * We can disable preemption locally around accessing the per-CPU variable, - * and use the resolved vcpu pointer after enabling preemption again, because - * even if the current thread is migrated to another CPU, reading the per-CPU - * value later will give us the same value as we update the per-CPU variable - * in the preempt notifier handlers. - */ -static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void) -{ - struct kvm_vcpu *vcpu; - - preempt_disable(); - vcpu = kvm_arm_get_running_vcpu(); - preempt_enable(); - return vcpu; -} - -/* Must be called with irq->irq_lock held */ -static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool is_uaccess) -{ - if (is_uaccess) - return; - - irq->pending_latch = true; - vgic_irq_set_phys_active(irq, true); -} - -void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - bool is_uaccess = !vgic_get_mmio_requester_vcpu(); - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock_irqsave(&irq->irq_lock, flags); - if (irq->hw) - vgic_hw_irq_spending(vcpu, irq, is_uaccess); - else - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -/* Must be called with irq->irq_lock held */ -static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool is_uaccess) -{ - if (is_uaccess) - return; - - irq->pending_latch = false; - - /* - * We don't want the guest to effectively mask the physical - * interrupt by doing a write to SPENDR followed by a write to - * CPENDR for HW interrupts, so we clear the active state on - * the physical side if the virtual interrupt is not active. 
- * This may lead to taking an additional interrupt on the - * host, but that should not be a problem as the worst that - * can happen is an additional vgic injection. We also clear - * the pending state to maintain proper semantics for edge HW - * interrupts. - */ - vgic_irq_set_phys_pending(irq, false); - if (!irq->active) - vgic_irq_set_phys_active(irq, false); -} - -void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - bool is_uaccess = !vgic_get_mmio_requester_vcpu(); - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock_irqsave(&irq->irq_lock, flags); - - if (irq->hw) - vgic_hw_irq_cpending(vcpu, irq, is_uaccess); - else - irq->pending_latch = false; - - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->active) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -/* Must be called with irq->irq_lock held */ -static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool active, bool is_uaccess) -{ - if (is_uaccess) - return; - - irq->active = active; - vgic_irq_set_phys_active(irq, active); -} - -static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool active) -{ - unsigned long flags; - struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu(); - - spin_lock_irqsave(&irq->irq_lock, flags); - - if (irq->hw) { - vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); - } else { - u32 model = 
vcpu->kvm->arch.vgic.vgic_model; - u8 active_source; - - irq->active = active; - - /* - * The GICv2 architecture indicates that the source CPUID for - * an SGI should be provided during an EOI which implies that - * the active state is stored somewhere, but at the same time - * this state is not architecturally exposed anywhere and we - * have no way of knowing the right source. - * - * This may lead to a VCPU not being able to receive - * additional instances of a particular SGI after migration - * for a GICv2 VM on some GIC implementations. Oh well. - */ - active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && - active && vgic_irq_is_sgi(irq->intid)) - irq->active_source = active_source; - } - - if (irq->active) - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - else - spin_unlock_irqrestore(&irq->irq_lock, flags); -} - -/* - * If we are fiddling with an IRQ's active state, we have to make sure the IRQ - * is not queued on some running VCPU's LRs, because then the change to the - * active state can be overwritten when the VCPU's state is synced coming back - * from the guest. - * - * For shared interrupts, we have to stop all the VCPUs because interrupts can - * be migrated while we don't hold the IRQ locks and we don't want to be - * chasing moving targets. - * - * For private interrupts we don't have to do anything because userspace - * accesses to the VGIC state already require all VCPUs to be stopped, and - * only the VCPU itself can modify its private interrupts active state, which - * guarantees that the VCPU is not running. 
- */ -static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) -{ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || - intid > VGIC_NR_PRIVATE_IRQS) - kvm_arm_halt_guest(vcpu->kvm); -} - -/* See vgic_change_active_prepare */ -static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) -{ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || - intid > VGIC_NR_PRIVATE_IRQS) - kvm_arm_resume_guest(vcpu->kvm); -} - -static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - vgic_mmio_change_active(vcpu, irq, false); - vgic_put_irq(vcpu->kvm, irq); - } -} - -void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - - mutex_lock(&vcpu->kvm->lock); - vgic_change_active_prepare(vcpu, intid); - - __vgic_mmio_write_cactive(vcpu, addr, len, val); - - vgic_change_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); -} - -int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - __vgic_mmio_write_cactive(vcpu, addr, len, val); - return 0; -} - -static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - vgic_mmio_change_active(vcpu, irq, true); - vgic_put_irq(vcpu->kvm, irq); - } -} - -void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - - mutex_lock(&vcpu->kvm->lock); - vgic_change_active_prepare(vcpu, intid); - - 
__vgic_mmio_write_sactive(vcpu, addr, len, val); - - vgic_change_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); -} - -int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - __vgic_mmio_write_sactive(vcpu, addr, len, val); - return 0; -} - -unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - u64 val = 0; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - val |= (u64)irq->priority << (i * 8); - - vgic_put_irq(vcpu->kvm, irq); - } - - return val; -} - -/* - * We currently don't handle changing the priority of an interrupt that - * is already pending on a VCPU. If there is a need for this, we would - * need to make this VCPU exit and re-evaluate the priorities, potentially - * leading to this interrupt getting presented now to the guest (if it has - * been masked by the priority mask before). 
- */ -void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - unsigned long flags; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock_irqsave(&irq->irq_lock, flags); - /* Narrow the priority range to what we actually support */ - irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); - spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } -} - -unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 2); - u32 value = 0; - int i; - - for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->config == VGIC_CONFIG_EDGE) - value |= (2U << (i * 2)); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -void vgic_mmio_write_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 2); - int i; - unsigned long flags; - - for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq; - - /* - * The configuration cannot be changed for SGIs in general, - * for PPIs this is IMPLEMENTATION DEFINED. The arch timer - * code relies on PPIs being level triggered, so we also - * make them read-only here. 
- */ - if (intid + i < VGIC_NR_PRIVATE_IRQS) - continue; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); - - if (test_bit(i * 2 + 1, &val)) - irq->config = VGIC_CONFIG_EDGE; - else - irq->config = VGIC_CONFIG_LEVEL; - - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - } -} - -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) -{ - int i; - u64 val = 0; - int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - - for (i = 0; i < 32; i++) { - struct vgic_irq *irq; - - if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) - continue; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) - val |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return val; -} - -void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, - const u64 val) -{ - int i; - int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - unsigned long flags; - - for (i = 0; i < 32; i++) { - struct vgic_irq *irq; - bool new_level; - - if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) - continue; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - /* - * Line level is set irrespective of irq type - * (level or edge) to avoid dependency that VM should - * restore irq config before line level. 
- */ - new_level = !!(val & (1U << i)); - spin_lock_irqsave(&irq->irq_lock, flags); - irq->line_level = new_level; - if (new_level) - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - else - spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } -} - -static int match_region(const void *key, const void *elt) -{ - const unsigned int offset = (unsigned long)key; - const struct vgic_register_region *region = elt; - - if (offset < region->reg_offset) - return -1; - - if (offset >= region->reg_offset + region->len) - return 1; - - return 0; -} - -const struct vgic_register_region * -vgic_find_mmio_region(const struct vgic_register_region *regions, - int nr_regions, unsigned int offset) -{ - return bsearch((void *)(uintptr_t)offset, regions, nr_regions, - sizeof(regions[0]), match_region); -} - -void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_vmcr(vcpu, vmcr); - else - vgic_v3_set_vmcr(vcpu, vmcr); -} - -void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_get_vmcr(vcpu, vmcr); - else - vgic_v3_get_vmcr(vcpu, vmcr); -} - -/* - * kvm_mmio_read_buf() returns a value in a format where it can be converted - * to a byte array and be directly observed as the guest wanted it to appear - * in memory if it had done the store itself, which is LE for the GIC, as the - * guest knows the GIC is always LE. - * - * We convert this value to the CPUs native format to deal with it as a data - * value. 
- */ -unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len) -{ - unsigned long data = kvm_mmio_read_buf(val, len); - - switch (len) { - case 1: - return data; - case 2: - return le16_to_cpu(data); - case 4: - return le32_to_cpu(data); - default: - return le64_to_cpu(data); - } -} - -/* - * kvm_mmio_write_buf() expects a value in a format such that if converted to - * a byte array it is observed as the guest would see it if it could perform - * the load directly. Since the GIC is LE, and the guest knows this, the - * guest expects a value in little endian format. - * - * We convert the data value from the CPUs native format to LE so that the - * value is returned in the proper format. - */ -void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, - unsigned long data) -{ - switch (len) { - case 1: - break; - case 2: - data = cpu_to_le16(data); - break; - case 4: - data = cpu_to_le32(data); - break; - default: - data = cpu_to_le64(data); - } - - kvm_mmio_write_buf(buf, len, data); -} - -static -struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev) -{ - return container_of(dev, struct vgic_io_device, dev); -} - -static bool check_region(const struct kvm *kvm, - const struct vgic_register_region *region, - gpa_t addr, int len) -{ - int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - - switch (len) { - case sizeof(u8): - flags = VGIC_ACCESS_8bit; - break; - case sizeof(u32): - flags = VGIC_ACCESS_32bit; - break; - case sizeof(u64): - flags = VGIC_ACCESS_64bit; - break; - default: - return false; - } - - if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) { - if (!region->bits_per_irq) - return true; - - /* Do we access a non-allocated IRQ? 
*/ - return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs; - } - - return false; -} - -const struct vgic_register_region * -vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, - gpa_t addr, int len) -{ - const struct vgic_register_region *region; - - region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, - addr - iodev->base_addr); - if (!region || !check_region(vcpu->kvm, region, addr, len)) - return NULL; - - return region; -} - -static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, u32 *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - struct kvm_vcpu *r_vcpu; - - region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); - if (!region) { - *val = 0; - return 0; - } - - r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; - if (region->uaccess_read) - *val = region->uaccess_read(r_vcpu, addr, sizeof(u32)); - else - *val = region->read(r_vcpu, addr, sizeof(u32)); - - return 0; -} - -static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, const u32 *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - struct kvm_vcpu *r_vcpu; - - region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); - if (!region) - return 0; - - r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; - if (region->uaccess_write) - return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); - - region->write(r_vcpu, addr, sizeof(u32), *val); - return 0; -} - -/* - * Userland access to VGIC registers. 
- */ -int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, - bool is_write, int offset, u32 *val) -{ - if (is_write) - return vgic_uaccess_write(vcpu, &dev->dev, offset, val); - else - return vgic_uaccess_read(vcpu, &dev->dev, offset, val); -} - -static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, int len, void *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - unsigned long data = 0; - - region = vgic_get_mmio_region(vcpu, iodev, addr, len); - if (!region) { - memset(val, 0, len); - return 0; - } - - switch (iodev->iodev_type) { - case IODEV_CPUIF: - data = region->read(vcpu, addr, len); - break; - case IODEV_DIST: - data = region->read(vcpu, addr, len); - break; - case IODEV_REDIST: - data = region->read(iodev->redist_vcpu, addr, len); - break; - case IODEV_ITS: - data = region->its_read(vcpu->kvm, iodev->its, addr, len); - break; - } - - vgic_data_host_to_mmio_bus(val, len, data); - return 0; -} - -static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, int len, const void *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - unsigned long data = vgic_data_mmio_bus_to_host(val, len); - - region = vgic_get_mmio_region(vcpu, iodev, addr, len); - if (!region) - return 0; - - switch (iodev->iodev_type) { - case IODEV_CPUIF: - region->write(vcpu, addr, len, data); - break; - case IODEV_DIST: - region->write(vcpu, addr, len, data); - break; - case IODEV_REDIST: - region->write(iodev->redist_vcpu, addr, len, data); - break; - case IODEV_ITS: - region->its_write(vcpu->kvm, iodev->its, addr, len, data); - break; - } - - return 0; -} - -struct kvm_io_device_ops kvm_io_gic_ops = { - .read = dispatch_mmio_read, - .write = dispatch_mmio_write, -}; - -int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, - enum vgic_type type) -{ - struct 
vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev; - int ret = 0; - unsigned int len; - - switch (type) { - case VGIC_V2: - len = vgic_v2_init_dist_iodev(io_device); - break; - case VGIC_V3: - len = vgic_v3_init_dist_iodev(io_device); - break; - default: - BUG_ON(1); - } - - io_device->base_addr = dist_base_address; - io_device->iodev_type = IODEV_DIST; - io_device->redist_vcpu = NULL; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, - len, &io_device->dev); - mutex_unlock(&kvm->slots_lock); - - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h deleted file mode 100644 index a07f90acdaec..000000000000 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ -#ifndef __KVM_ARM_VGIC_MMIO_H__ -#define __KVM_ARM_VGIC_MMIO_H__ - -struct vgic_register_region { - unsigned int reg_offset; - unsigned int len; - unsigned int bits_per_irq; - unsigned int access_flags; - union { - unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len); - unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len); - }; - union { - void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - void (*its_write)(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val); - }; - unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len); - union { - int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val); - }; -}; - -extern struct kvm_io_device_ops kvm_io_gic_ops; - -#define VGIC_ACCESS_8bit 1 -#define VGIC_ACCESS_32bit 2 -#define VGIC_ACCESS_64bit 4 - -/* - * Generate a mask that covers the number of bytes required to address - * up to 1024 interrupts, each represented by <bits> bits. This assumes - * that <bits> is a power of two. - */ -#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1) - -/* - * (addr & mask) gives us the _byte_ offset for the INT ID. - * We multiply this by 8 the get the _bit_ offset, then divide this by - * the number of bits to learn the actual INT ID. - * But instead of a division (which requires a "long long div" implementation), - * we shift by the binary logarithm of <bits>. - * This assumes that <bits> is a power of two. - */ -#define VGIC_ADDR_TO_INTID(addr, bits) (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \ - 8 >> ilog2(bits)) - -/* - * Some VGIC registers store per-IRQ information, with a different number - * of bits per IRQ. For those registers this macro is used. 
- * The _WITH_LENGTH version instantiates registers with a fixed length - * and is mutually exclusive with the _PER_IRQ version. - */ -#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = bpi, \ - .len = bpi * 1024 / 8, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - .uaccess_read = ur, \ - .uaccess_write = uw, \ - } - -#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = 0, \ - .len = length, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - } - -#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = 0, \ - .len = length, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - .uaccess_read = urd, \ - .uaccess_write = uwr, \ - } - -int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu, - struct vgic_register_region *reg_desc, - struct vgic_io_device *region, - int nr_irqs, bool offset_private); - -unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); - -void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, - unsigned long data); - -unsigned long extract_bytes(u64 data, unsigned int offset, - unsigned int num); - -u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - -int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - -unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len); - -void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - 
-unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, - bool is_write, int offset, u32 *val); - -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); - -void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, - const u64 val); - -unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); - -unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); - 
-u64 vgic_sanitise_outer_cacheability(u64 reg); -u64 vgic_sanitise_inner_cacheability(u64 reg); -u64 vgic_sanitise_shareability(u64 reg); -u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, - u64 (*sanitise_fn)(u64)); - -/* Find the proper register handler entry given a certain address offset */ -const struct vgic_register_region * -vgic_find_mmio_region(const struct vgic_register_region *regions, - int nr_regions, unsigned int offset); - -#endif diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c deleted file mode 100644 index 69b892abd7dc..000000000000 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ /dev/null @@ -1,504 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/irqchip/arm-gic.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <kvm/arm_vgic.h> -#include <asm/kvm_mmu.h> - -#include "vgic.h" - -static inline void vgic_v2_write_lr(int lr, u32 val) -{ - void __iomem *base = kvm_vgic_global_state.vctrl_base; - - writel_relaxed(val, base + GICH_LR0 + (lr * 4)); -} - -void vgic_v2_init_lrs(void) -{ - int i; - - for (i = 0; i < kvm_vgic_global_state.nr_lr; i++) - vgic_v2_write_lr(i, 0); -} - -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - - cpuif->vgic_hcr |= GICH_HCR_UIE; -} - -static bool lr_signals_eoi_mi(u32 lr_val) -{ - return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) && - !(lr_val & GICH_LR_HW); -} - -/* - * transfer the content of the LRs back into the corresponding ap_list: - * - active bit is transferred as is - * - pending bit is - * - transferred as is in case of edge sensitive IRQs - * - set to the line-level (resample time) for level sensitive IRQs - */ -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; - int lr; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - - cpuif->vgic_hcr &= ~GICH_HCR_UIE; - - for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { - u32 val = cpuif->vgic_lr[lr]; - u32 cpuid, intid = val & GICH_LR_VIRTUALID; - struct vgic_irq *irq; - - /* Extract the source vCPU id from the LR */ - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - cpuid &= 7; - - /* Notify fds when the guest EOI'ed a level-triggered SPI */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - - spin_lock(&irq->irq_lock); - - /* Always preserve the active bit */ - irq->active = !!(val & GICH_LR_ACTIVE_BIT); - - if (irq->active && vgic_irq_is_sgi(intid)) - 
irq->active_source = cpuid; - - /* Edge is the only case where we preserve the pending bit */ - if (irq->config == VGIC_CONFIG_EDGE && - (val & GICH_LR_PENDING_BIT)) { - irq->pending_latch = true; - - if (vgic_irq_is_sgi(intid)) - irq->source |= (1 << cpuid); - } - - /* - * Clear soft pending state when level irqs have been acked. - */ - if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) - irq->pending_latch = false; - - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - */ - if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) { - irq->line_level = vgic_get_phys_line_level(irq); - - if (!irq->line_level) - vgic_irq_set_phys_active(irq, false); - } - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } - - vgic_cpu->used_lrs = 0; -} - -/* - * Populates the particular LR with the state of a given IRQ: - * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq - * - for a level sensitive IRQ the pending state value is unchanged; - * it is dictated directly by the input level - * - * If @irq describes an SGI with multiple sources, we choose the - * lowest-numbered source VCPU and clear that bit in the source bitmap. - * - * The irq_lock must be held by the caller. 
- */ -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) -{ - u32 val = irq->intid; - bool allow_pending = true; - - if (irq->active) { - val |= GICH_LR_ACTIVE_BIT; - if (vgic_irq_is_sgi(irq->intid)) - val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; - if (vgic_irq_is_multi_sgi(irq)) { - allow_pending = false; - val |= GICH_LR_EOI; - } - } - - if (irq->group) - val |= GICH_LR_GROUP1; - - if (irq->hw) { - val |= GICH_LR_HW; - val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; - /* - * Never set pending+active on a HW interrupt, as the - * pending state is kept at the physical distributor - * level. - */ - if (irq->active) - allow_pending = false; - } else { - if (irq->config == VGIC_CONFIG_LEVEL) { - val |= GICH_LR_EOI; - - /* - * Software resampling doesn't work very well - * if we allow P+A, so let's not do that. - */ - if (irq->active) - allow_pending = false; - } - } - - if (allow_pending && irq_is_pending(irq)) { - val |= GICH_LR_PENDING_BIT; - - if (irq->config == VGIC_CONFIG_EDGE) - irq->pending_latch = false; - - if (vgic_irq_is_sgi(irq->intid)) { - u32 src = ffs(irq->source); - - BUG_ON(!src); - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { - irq->pending_latch = true; - val |= GICH_LR_EOI; - } - } - } - - /* - * Level-triggered mapped IRQs are special because we only observe - * rising edges as input to the VGIC. We therefore lower the line - * level here, so that we can take new virtual IRQs. See - * vgic_v2_fold_lr_state for more info. - */ - if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) - irq->line_level = false; - - /* The GICv2 LR only holds five bits of priority. 
*/ - val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; -} - -void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0; -} - -void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - u32 vmcr; - - vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) & - GICH_VMCR_ENABLE_GRP0_MASK; - vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) & - GICH_VMCR_ENABLE_GRP1_MASK; - vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) & - GICH_VMCR_ACK_CTL_MASK; - vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) & - GICH_VMCR_FIQ_EN_MASK; - vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) & - GICH_VMCR_CBPR_MASK; - vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) & - GICH_VMCR_EOI_MODE_MASK; - vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & - GICH_VMCR_ALIAS_BINPOINT_MASK; - vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & - GICH_VMCR_BINPOINT_MASK; - vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) << - GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; - - cpu_if->vgic_vmcr = vmcr; -} - -void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - u32 vmcr; - - vmcr = cpu_if->vgic_vmcr; - - vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >> - GICH_VMCR_ENABLE_GRP0_SHIFT; - vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >> - GICH_VMCR_ENABLE_GRP1_SHIFT; - vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >> - GICH_VMCR_ACK_CTL_SHIFT; - vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >> - GICH_VMCR_FIQ_EN_SHIFT; - vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >> - GICH_VMCR_CBPR_SHIFT; - vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >> - GICH_VMCR_EOI_MODE_SHIFT; - - vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> - GICH_VMCR_ALIAS_BINPOINT_SHIFT; - vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> - 
GICH_VMCR_BINPOINT_SHIFT; - vmcrp->pmr = ((vmcr & GICH_VMCR_PRIMASK_MASK) >> - GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; -} - -void vgic_v2_enable(struct kvm_vcpu *vcpu) -{ - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; - - /* Get the show on the road... */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; -} - -/* check for overlapping regions and for regions crossing the end of memory */ -static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) -{ - if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base) - return false; - if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base) - return false; - - if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base) - return true; - if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base) - return true; - - return false; -} - -int vgic_v2_map_resources(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int ret = 0; - - if (vgic_ready(kvm)) - goto out; - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || - IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { - kvm_err("Need to set vgic cpu and dist addresses first\n"); - ret = -ENXIO; - goto out; - } - - if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) { - kvm_err("VGIC CPU and dist frames overlap\n"); - ret = -EINVAL; - goto out; - } - - /* - * Initialize the vgic if this hasn't already been done on demand by - * accessing the vgic state from userspace. 
- */ - ret = vgic_init(kvm); - if (ret) { - kvm_err("Unable to initialize VGIC dynamic data structures\n"); - goto out; - } - - ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2); - if (ret) { - kvm_err("Unable to register VGIC MMIO regions\n"); - goto out; - } - - if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { - ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, - kvm_vgic_global_state.vcpu_base, - KVM_VGIC_V2_CPU_SIZE, true); - if (ret) { - kvm_err("Unable to remap VGIC CPU to VCPU\n"); - goto out; - } - } - - dist->ready = true; - -out: - return ret; -} - -DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap); - -/** - * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT - * @node: pointer to the DT node - * - * Returns 0 if a GICv2 has been found, returns an error code otherwise - */ -int vgic_v2_probe(const struct gic_kvm_info *info) -{ - int ret; - u32 vtr; - - if (!info->vctrl.start) { - kvm_err("GICH not present in the firmware table\n"); - return -ENXIO; - } - - if (!PAGE_ALIGNED(info->vcpu.start) || - !PAGE_ALIGNED(resource_size(&info->vcpu))) { - kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n"); - - ret = create_hyp_io_mappings(info->vcpu.start, - resource_size(&info->vcpu), - &kvm_vgic_global_state.vcpu_base_va, - &kvm_vgic_global_state.vcpu_hyp_va); - if (ret) { - kvm_err("Cannot map GICV into hyp\n"); - goto out; - } - - static_branch_enable(&vgic_v2_cpuif_trap); - } - - ret = create_hyp_io_mappings(info->vctrl.start, - resource_size(&info->vctrl), - &kvm_vgic_global_state.vctrl_base, - &kvm_vgic_global_state.vctrl_hyp); - if (ret) { - kvm_err("Cannot map VCTRL into hyp\n"); - goto out; - } - - vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); - kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; - - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); - if (ret) { - kvm_err("Cannot register GICv2 KVM device\n"); - goto out; - } - - 
kvm_vgic_global_state.can_emulate_gicv2 = true; - kvm_vgic_global_state.vcpu_base = info->vcpu.start; - kvm_vgic_global_state.type = VGIC_V2; - kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; - - kvm_debug("vgic-v2@%llx\n", info->vctrl.start); - - return 0; -out: - if (kvm_vgic_global_state.vctrl_base) - iounmap(kvm_vgic_global_state.vctrl_base); - if (kvm_vgic_global_state.vcpu_base_va) - iounmap(kvm_vgic_global_state.vcpu_base_va); - - return ret; -} - -static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - u64 elrsr; - int i; - - elrsr = readl_relaxed(base + GICH_ELRSR0); - if (unlikely(used_lrs > 32)) - elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32; - - for (i = 0; i < used_lrs; i++) { - if (elrsr & (1UL << i)) - cpu_if->vgic_lr[i] &= ~GICH_LR_STATE; - else - cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4)); - - writel_relaxed(0, base + GICH_LR0 + (i * 4)); - } -} - -void vgic_v2_save_state(struct kvm_vcpu *vcpu) -{ - void __iomem *base = kvm_vgic_global_state.vctrl_base; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - - if (!base) - return; - - if (used_lrs) { - save_lrs(vcpu, base); - writel_relaxed(0, base + GICH_HCR); - } -} - -void vgic_v2_restore_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - void __iomem *base = kvm_vgic_global_state.vctrl_base; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - int i; - - if (!base) - return; - - if (used_lrs) { - writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); - for (i = 0; i < used_lrs; i++) { - writel_relaxed(cpu_if->vgic_lr[i], - base + GICH_LR0 + (i * 4)); - } - } -} - -void vgic_v2_load(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - writel_relaxed(cpu_if->vgic_vmcr, - kvm_vgic_global_state.vctrl_base + GICH_VMCR); - writel_relaxed(cpu_if->vgic_apr, - 
kvm_vgic_global_state.vctrl_base + GICH_APR); -} - -void vgic_v2_put(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); - cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); -} diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c deleted file mode 100644 index 9c0dd234ebe8..000000000000 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/irqchip/arm-gic-v3.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <kvm/arm_vgic.h> -#include <asm/kvm_hyp.h> -#include <asm/kvm_mmu.h> -#include <asm/kvm_asm.h> - -#include "vgic.h" - -static bool group0_trap; -static bool group1_trap; -static bool common_trap; -static bool gicv4_enable; - -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - - cpuif->vgic_hcr |= ICH_HCR_UIE; -} - -static bool lr_signals_eoi_mi(u64 lr_val) -{ - return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && - !(lr_val & ICH_LR_HW); -} - -void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - int lr; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - - cpuif->vgic_hcr &= ~ICH_HCR_UIE; - - for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { - u64 val = cpuif->vgic_lr[lr]; - u32 intid, cpuid; - struct vgic_irq *irq; - bool is_v2_sgi = false; - - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { - intid = val & ICH_LR_VIRTUAL_ID_MASK; - } else { - intid = val & GICH_LR_VIRTUALID; - is_v2_sgi = vgic_irq_is_sgi(intid); - } - - /* Notify fds when the guest EOI'ed a level-triggered IRQ */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - if (!irq) /* An LPI could have been unmapped. 
*/ - continue; - - spin_lock(&irq->irq_lock); - - /* Always preserve the active bit */ - irq->active = !!(val & ICH_LR_ACTIVE_BIT); - - if (irq->active && is_v2_sgi) - irq->active_source = cpuid; - - /* Edge is the only case where we preserve the pending bit */ - if (irq->config == VGIC_CONFIG_EDGE && - (val & ICH_LR_PENDING_BIT)) { - irq->pending_latch = true; - - if (is_v2_sgi) - irq->source |= (1 << cpuid); - } - - /* - * Clear soft pending state when level irqs have been acked. - */ - if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) - irq->pending_latch = false; - - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. 
- */ - if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) { - irq->line_level = vgic_get_phys_line_level(irq); - - if (!irq->line_level) - vgic_irq_set_phys_active(irq, false); - } - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } - - vgic_cpu->used_lrs = 0; -} - -/* Requires the irq to be locked already */ -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) -{ - u32 model = vcpu->kvm->arch.vgic.vgic_model; - u64 val = irq->intid; - bool allow_pending = true, is_v2_sgi; - - is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && - model == KVM_DEV_TYPE_ARM_VGIC_V2); - - if (irq->active) { - val |= ICH_LR_ACTIVE_BIT; - if (is_v2_sgi) - val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; - if (vgic_irq_is_multi_sgi(irq)) { - allow_pending = false; - val |= ICH_LR_EOI; - } - } - - if (irq->hw) { - val |= ICH_LR_HW; - val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; - /* - * Never set pending+active on a HW interrupt, as the - * pending state is kept at the physical distributor - * level. - */ - if (irq->active) - allow_pending = false; - } else { - if (irq->config == VGIC_CONFIG_LEVEL) { - val |= ICH_LR_EOI; - - /* - * Software resampling doesn't work very well - * if we allow P+A, so let's not do that. - */ - if (irq->active) - allow_pending = false; - } - } - - if (allow_pending && irq_is_pending(irq)) { - val |= ICH_LR_PENDING_BIT; - - if (irq->config == VGIC_CONFIG_EDGE) - irq->pending_latch = false; - - if (vgic_irq_is_sgi(irq->intid) && - model == KVM_DEV_TYPE_ARM_VGIC_V2) { - u32 src = ffs(irq->source); - - BUG_ON(!src); - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { - irq->pending_latch = true; - val |= ICH_LR_EOI; - } - } - } - - /* - * Level-triggered mapped IRQs are special because we only observe - * rising edges as input to the VGIC. We therefore lower the line - * level here, so that we can take new virtual IRQs. 
See - * vgic_v3_fold_lr_state for more info. - */ - if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) - irq->line_level = false; - - if (irq->group) - val |= ICH_LR_GROUP; - - val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; -} - -void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; -} - -void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - u32 vmcr; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { - vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) & - ICH_VMCR_ACK_CTL_MASK; - vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) & - ICH_VMCR_FIQ_EN_MASK; - } else { - /* - * When emulating GICv3 on GICv3 with SRE=1 on the - * VFIQEn bit is RES1 and the VAckCtl bit is RES0. - */ - vmcr = ICH_VMCR_FIQ_EN_MASK; - } - - vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; - vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; - vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; - vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; - vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; - vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; - vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; - - cpu_if->vgic_vmcr = vmcr; -} - -void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - u32 vmcr; - - vmcr = cpu_if->vgic_vmcr; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { - vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >> - ICH_VMCR_ACK_CTL_SHIFT; - vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >> - ICH_VMCR_FIQ_EN_SHIFT; - } else { - /* - * When emulating GICv3 on GICv3 with SRE=1 on the - * VFIQEn bit is RES1 
and the VAckCtl bit is RES0. - */ - vmcrp->fiqen = 1; - vmcrp->ackctl = 0; - } - - vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; - vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT; - vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; - vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; - vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; - vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; - vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; -} - -#define INITIAL_PENDBASER_VALUE \ - (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ - GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ - GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) - -void vgic_v3_enable(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; - - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vgic_v3->vgic_vmcr = 0; - - /* - * If we are emulating a GICv3, we do it in an non-GICv2-compatible - * way, so we force SRE to 1 to demonstrate this to the guest. - * Also, we don't support any form of IRQ/FIQ bypass. - * This goes with the spec allowing the value to be RAO/WI. - */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB | - ICC_SRE_EL1_DFB | - ICC_SRE_EL1_SRE); - vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; - } else { - vgic_v3->vgic_sre = 0; - } - - vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_ID_BITS_MASK) >> - ICH_VTR_ID_BITS_SHIFT; - vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_PRI_BITS_MASK) >> - ICH_VTR_PRI_BITS_SHIFT) + 1; - - /* Get the show on the road... 
*/ - vgic_v3->vgic_hcr = ICH_HCR_EN; - if (group0_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL0; - if (group1_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL1; - if (common_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TC; -} - -int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) -{ - struct kvm_vcpu *vcpu; - int byte_offset, bit_nr; - gpa_t pendbase, ptr; - bool status; - u8 val; - int ret; - unsigned long flags; - -retry: - vcpu = irq->target_vcpu; - if (!vcpu) - return 0; - - pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); - - byte_offset = irq->intid / BITS_PER_BYTE; - bit_nr = irq->intid % BITS_PER_BYTE; - ptr = pendbase + byte_offset; - - ret = kvm_read_guest_lock(kvm, ptr, &val, 1); - if (ret) - return ret; - - status = val & (1 << bit_nr); - - spin_lock_irqsave(&irq->irq_lock, flags); - if (irq->target_vcpu != vcpu) { - spin_unlock_irqrestore(&irq->irq_lock, flags); - goto retry; - } - irq->pending_latch = status; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - - if (status) { - /* clear consumed data */ - val &= ~(1 << bit_nr); - ret = kvm_write_guest(kvm, ptr, &val, 1); - if (ret) - return ret; - } - return 0; -} - -/** - * vgic_its_save_pending_tables - Save the pending tables into guest RAM - * kvm lock and all vcpu lock must be held - */ -int vgic_v3_save_pending_tables(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int last_byte_offset = -1; - struct vgic_irq *irq; - int ret; - u8 val; - - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - int byte_offset, bit_nr; - struct kvm_vcpu *vcpu; - gpa_t pendbase, ptr; - bool stored; - - vcpu = irq->target_vcpu; - if (!vcpu) - continue; - - pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); - - byte_offset = irq->intid / BITS_PER_BYTE; - bit_nr = irq->intid % BITS_PER_BYTE; - ptr = pendbase + byte_offset; - - if (byte_offset != last_byte_offset) { - ret = kvm_read_guest_lock(kvm, ptr, &val, 1); - if (ret) - return ret; - last_byte_offset = 
byte_offset; - } - - stored = val & (1U << bit_nr); - if (stored == irq->pending_latch) - continue; - - if (irq->pending_latch) - val |= 1 << bit_nr; - else - val &= ~(1 << bit_nr); - - ret = kvm_write_guest(kvm, ptr, &val, 1); - if (ret) - return ret; - } - return 0; -} - -/** - * vgic_v3_rdist_overlap - check if a region overlaps with any - * existing redistributor region - * - * @kvm: kvm handle - * @base: base of the region - * @size: size of region - * - * Return: true if there is an overlap - */ -bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size) -{ - struct vgic_dist *d = &kvm->arch.vgic; - struct vgic_redist_region *rdreg; - - list_for_each_entry(rdreg, &d->rd_regions, list) { - if ((base + size > rdreg->base) && - (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg))) - return true; - } - return false; -} - -/* - * Check for overlapping regions and for regions crossing the end of memory - * for base addresses which have already been set. - */ -bool vgic_v3_check_base(struct kvm *kvm) -{ - struct vgic_dist *d = &kvm->arch.vgic; - struct vgic_redist_region *rdreg; - - if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && - d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) - return false; - - list_for_each_entry(rdreg, &d->rd_regions, list) { - if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) < - rdreg->base) - return false; - } - - if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base)) - return true; - - return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base, - KVM_VGIC_V3_DIST_SIZE); -} - -/** - * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one - * which has free space to put a new rdist region. - * - * @rd_regions: redistributor region list head - * - * A redistributor regions maps n redistributors, n = region size / (2 x 64kB). - * Stride between redistributors is 0 and regions are filled in the index order. - * - * Return: the redist region handle, if any, that has space to map a new rdist - * region. 
- */ -struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions) -{ - struct vgic_redist_region *rdreg; - - list_for_each_entry(rdreg, rd_regions, list) { - if (!vgic_v3_redist_region_full(rdreg)) - return rdreg; - } - return NULL; -} - -struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, - u32 index) -{ - struct list_head *rd_regions = &kvm->arch.vgic.rd_regions; - struct vgic_redist_region *rdreg; - - list_for_each_entry(rdreg, rd_regions, list) { - if (rdreg->index == index) - return rdreg; - } - return NULL; -} - - -int vgic_v3_map_resources(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int ret = 0; - int c; - - if (vgic_ready(kvm)) - goto out; - - kvm_for_each_vcpu(c, vcpu, kvm) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { - kvm_debug("vcpu %d redistributor base not set\n", c); - ret = -ENXIO; - goto out; - } - } - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { - kvm_err("Need to set vgic distributor addresses first\n"); - ret = -ENXIO; - goto out; - } - - if (!vgic_v3_check_base(kvm)) { - kvm_err("VGIC redist and dist frames overlap\n"); - ret = -EINVAL; - goto out; - } - - /* - * For a VGICv3 we require the userland to explicitly initialize - * the VGIC before we need to use it. 
- */ - if (!vgic_initialized(kvm)) { - ret = -EBUSY; - goto out; - } - - ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3); - if (ret) { - kvm_err("Unable to register VGICv3 dist MMIO regions\n"); - goto out; - } - - dist->ready = true; - -out: - return ret; -} - -DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); - -static int __init early_group0_trap_cfg(char *buf) -{ - return strtobool(buf, &group0_trap); -} -early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); - -static int __init early_group1_trap_cfg(char *buf) -{ - return strtobool(buf, &group1_trap); -} -early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); - -static int __init early_common_trap_cfg(char *buf) -{ - return strtobool(buf, &common_trap); -} -early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); - -static int __init early_gicv4_enable(char *buf) -{ - return strtobool(buf, &gicv4_enable); -} -early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); - -/** - * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT - * @node: pointer to the DT node - * - * Returns 0 if a GICv3 has been found, returns an error code otherwise - */ -int vgic_v3_probe(const struct gic_kvm_info *info) -{ - u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); - int ret; - - /* - * The ListRegs field is 5 bits, but there is a architectural - * maximum of 16 list registers. Just ignore bit 4... - */ - kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; - kvm_vgic_global_state.can_emulate_gicv2 = false; - kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; - - /* GICv4 support? */ - if (info->has_v4) { - kvm_vgic_global_state.has_gicv4 = gicv4_enable; - kvm_info("GICv4 support %sabled\n", - gicv4_enable ? 
"en" : "dis"); - } - - if (!info->vcpu.start) { - kvm_info("GICv3: no GICV resource entry\n"); - kvm_vgic_global_state.vcpu_base = 0; - } else if (!PAGE_ALIGNED(info->vcpu.start)) { - pr_warn("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)info->vcpu.start); - kvm_vgic_global_state.vcpu_base = 0; - } else { - kvm_vgic_global_state.vcpu_base = info->vcpu.start; - kvm_vgic_global_state.can_emulate_gicv2 = true; - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); - if (ret) { - kvm_err("Cannot register GICv2 KVM device.\n"); - return ret; - } - kvm_info("vgic-v2@%llx\n", info->vcpu.start); - } - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); - if (ret) { - kvm_err("Cannot register GICv3 KVM device.\n"); - kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); - return ret; - } - - if (kvm_vgic_global_state.vcpu_base == 0) - kvm_info("disabling GICv2 emulation\n"); - -#ifdef CONFIG_ARM64 - if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) { - group0_trap = true; - group1_trap = true; - } -#endif - - if (group0_trap || group1_trap || common_trap) { - kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n", - group0_trap ? "G0" : "", - group1_trap ? "G1" : "", - common_trap ? "C" : ""); - static_branch_enable(&vgic_v3_cpuif_trap); - } - - kvm_vgic_global_state.vctrl_base = NULL; - kvm_vgic_global_state.type = VGIC_V3; - kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; - - return 0; -} - -void vgic_v3_load(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - /* - * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen - * is dependent on ICC_SRE_EL1.SRE, and we have to perform the - * VMCR_EL2 save/restore in the world switch. 
- */ - if (likely(cpu_if->vgic_sre)) - kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - - kvm_call_hyp(__vgic_v3_restore_aprs, vcpu); - - if (has_vhe()) - __vgic_v3_activate_traps(vcpu); -} - -void vgic_v3_put(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); - - kvm_call_hyp(__vgic_v3_save_aprs, vcpu); - - if (has_vhe()) - __vgic_v3_deactivate_traps(vcpu); -} diff --git a/virt/kvm/arm/vgic/vgic-v4.c b/virt/kvm/arm/vgic/vgic-v4.c deleted file mode 100644 index 1ed5f2286b8e..000000000000 --- a/virt/kvm/arm/vgic/vgic-v4.c +++ /dev/null @@ -1,366 +0,0 @@ -/* - * Copyright (C) 2017 ARM Ltd. - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/irqdomain.h> -#include <linux/kvm_host.h> -#include <linux/irqchip/arm-gic-v3.h> - -#include "vgic.h" - -/* - * How KVM uses GICv4 (insert rude comments here): - * - * The vgic-v4 layer acts as a bridge between several entities: - * - The GICv4 ITS representation offered by the ITS driver - * - VFIO, which is in charge of the PCI endpoint - * - The virtual ITS, which is the only thing the guest sees - * - * The configuration of VLPIs is triggered by a callback from VFIO, - * instructing KVM that a PCI device has been configured to deliver - * MSIs to a vITS. - * - * kvm_vgic_v4_set_forwarding() is thus called with the routing entry, - * and this is used to find the corresponding vITS data structures - * (ITS instance, device, event and irq) using a process that is - * extremely similar to the injection of an MSI. - * - * At this stage, we can link the guest's view of an LPI (uniquely - * identified by the routing entry) and the host irq, using the GICv4 - * driver mapping operation. Should the mapping succeed, we've then - * successfully upgraded the guest's LPI to a VLPI. We can then start - * with updating GICv4's view of the property table and generating an - * INValidation in order to kickstart the delivery of this VLPI to the - * guest directly, without software intervention. Well, almost. - * - * When the PCI endpoint is deconfigured, this operation is reversed - * with VFIO calling kvm_vgic_v4_unset_forwarding(). - * - * Once the VLPI has been mapped, it needs to follow any change the - * guest performs on its LPI through the vITS. For that, a number of - * command handlers have hooks to communicate these changes to the HW: - * - Any invalidation triggers a call to its_prop_update_vlpi() - * - The INT command results in a irq_set_irqchip_state(), which - * generates an INT on the corresponding VLPI. 
- * - The CLEAR command results in a irq_set_irqchip_state(), which - * generates an CLEAR on the corresponding VLPI. - * - DISCARD translates into an unmap, similar to a call to - * kvm_vgic_v4_unset_forwarding(). - * - MOVI is translated by an update of the existing mapping, changing - * the target vcpu, resulting in a VMOVI being generated. - * - MOVALL is translated by a string of mapping updates (similar to - * the handling of MOVI). MOVALL is horrible. - * - * Note that a DISCARD/MAPTI sequence emitted from the guest without - * reprogramming the PCI endpoint after MAPTI does not result in a - * VLPI being mapped, as there is no callback from VFIO (the guest - * will get the interrupt via the normal SW injection). Fixing this is - * not trivial, and requires some horrible messing with the VFIO - * internals. Not fun. Don't do that. - * - * Then there is the scheduling. Each time a vcpu is about to run on a - * physical CPU, KVM must tell the corresponding redistributor about - * it. And if we've migrated our vcpu from one CPU to another, we must - * tell the ITS (so that the messages reach the right redistributor). - * This is done in two steps: first issue a irq_set_affinity() on the - * irq corresponding to the vcpu, then call its_schedule_vpe(). You - * must be in a non-preemptible context. On exit, another call to - * its_schedule_vpe() tells the redistributor that we're done with the - * vcpu. - * - * Finally, the doorbell handling: Each vcpu is allocated an interrupt - * which will fire each time a VLPI is made pending whilst the vcpu is - * not running. Each time the vcpu gets blocked, the doorbell - * interrupt gets enabled. When the vcpu is unblocked (for whatever - * reason), the doorbell interrupt is disabled. 
- */ - -#define DB_IRQ_FLAGS (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING) - -static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info) -{ - struct kvm_vcpu *vcpu = info; - - vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true; - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - - return IRQ_HANDLED; -} - -/** - * vgic_v4_init - Initialize the GICv4 data structures - * @kvm: Pointer to the VM being initialized - * - * We may be called each time a vITS is created, or when the - * vgic is initialized. This relies on kvm->lock to be - * held. In both cases, the number of vcpus should now be - * fixed. - */ -int vgic_v4_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int i, nr_vcpus, ret; - - if (!kvm_vgic_global_state.has_gicv4) - return 0; /* Nothing to see here... move along. */ - - if (dist->its_vm.vpes) - return 0; - - nr_vcpus = atomic_read(&kvm->online_vcpus); - - dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes), - GFP_KERNEL); - if (!dist->its_vm.vpes) - return -ENOMEM; - - dist->its_vm.nr_vpes = nr_vcpus; - - kvm_for_each_vcpu(i, vcpu, kvm) - dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - - ret = its_alloc_vcpu_irqs(&dist->its_vm); - if (ret < 0) { - kvm_err("VPE IRQ allocation failure\n"); - kfree(dist->its_vm.vpes); - dist->its_vm.nr_vpes = 0; - dist->its_vm.vpes = NULL; - return ret; - } - - kvm_for_each_vcpu(i, vcpu, kvm) { - int irq = dist->its_vm.vpes[i]->irq; - - /* - * Don't automatically enable the doorbell, as we're - * flipping it back and forth when the vcpu gets - * blocked. Also disable the lazy disabling, as the - * doorbell could kick us out of the guest too - * early... 
- */ - irq_set_status_flags(irq, DB_IRQ_FLAGS); - ret = request_irq(irq, vgic_v4_doorbell_handler, - 0, "vcpu", vcpu); - if (ret) { - kvm_err("failed to allocate vcpu IRQ%d\n", irq); - /* - * Trick: adjust the number of vpes so we know - * how many to nuke on teardown... - */ - dist->its_vm.nr_vpes = i; - break; - } - } - - if (ret) - vgic_v4_teardown(kvm); - - return ret; -} - -/** - * vgic_v4_teardown - Free the GICv4 data structures - * @kvm: Pointer to the VM being destroyed - * - * Relies on kvm->lock to be held. - */ -void vgic_v4_teardown(struct kvm *kvm) -{ - struct its_vm *its_vm = &kvm->arch.vgic.its_vm; - int i; - - if (!its_vm->vpes) - return; - - for (i = 0; i < its_vm->nr_vpes; i++) { - struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i); - int irq = its_vm->vpes[i]->irq; - - irq_clear_status_flags(irq, DB_IRQ_FLAGS); - free_irq(irq, vcpu); - } - - its_free_vcpu_irqs(its_vm); - kfree(its_vm->vpes); - its_vm->nr_vpes = 0; - its_vm->vpes = NULL; -} - -int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu) -{ - if (!vgic_supports_direct_msis(vcpu->kvm)) - return 0; - - return its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, false); -} - -int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu) -{ - int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq; - int err; - - if (!vgic_supports_direct_msis(vcpu->kvm)) - return 0; - - /* - * Before making the VPE resident, make sure the redistributor - * corresponding to our current CPU expects us here. See the - * doc in drivers/irqchip/irq-gic-v4.c to understand how this - * turns into a VMOVP command at the ITS level. - */ - err = irq_set_affinity(irq, cpumask_of(smp_processor_id())); - if (err) - return err; - - err = its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, true); - if (err) - return err; - - /* - * Now that the VPE is resident, let's get rid of a potential - * doorbell interrupt that would still be pending. 
- */ - err = irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, false); - - return err; -} - -static struct vgic_its *vgic_get_its(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *irq_entry) -{ - struct kvm_msi msi = (struct kvm_msi) { - .address_lo = irq_entry->msi.address_lo, - .address_hi = irq_entry->msi.address_hi, - .data = irq_entry->msi.data, - .flags = irq_entry->msi.flags, - .devid = irq_entry->msi.devid, - }; - - return vgic_msi_to_its(kvm, &msi); -} - -int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, - struct kvm_kernel_irq_routing_entry *irq_entry) -{ - struct vgic_its *its; - struct vgic_irq *irq; - struct its_vlpi_map map; - int ret; - - if (!vgic_supports_direct_msis(kvm)) - return 0; - - /* - * Get the ITS, and escape early on error (not a valid - * doorbell for any of our vITSs). - */ - its = vgic_get_its(kvm, irq_entry); - if (IS_ERR(its)) - return 0; - - mutex_lock(&its->its_lock); - - /* Perform then actual DevID/EventID -> LPI translation. */ - ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, - irq_entry->msi.data, &irq); - if (ret) - goto out; - - /* - * Emit the mapping request. If it fails, the ITS probably - * isn't v4 compatible, so let's silently bail out. Holding - * the ITS lock should ensure that nothing can modify the - * target vcpu. - */ - map = (struct its_vlpi_map) { - .vm = &kvm->arch.vgic.its_vm, - .vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe, - .vintid = irq->intid, - .properties = ((irq->priority & 0xfc) | - (irq->enabled ? 
LPI_PROP_ENABLED : 0) | - LPI_PROP_GROUP1), - .db_enabled = true, - }; - - ret = its_map_vlpi(virq, &map); - if (ret) - goto out; - - irq->hw = true; - irq->host_irq = virq; - -out: - mutex_unlock(&its->its_lock); - return ret; -} - -int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq, - struct kvm_kernel_irq_routing_entry *irq_entry) -{ - struct vgic_its *its; - struct vgic_irq *irq; - int ret; - - if (!vgic_supports_direct_msis(kvm)) - return 0; - - /* - * Get the ITS, and escape early on error (not a valid - * doorbell for any of our vITSs). - */ - its = vgic_get_its(kvm, irq_entry); - if (IS_ERR(its)) - return 0; - - mutex_lock(&its->its_lock); - - ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, - irq_entry->msi.data, &irq); - if (ret) - goto out; - - WARN_ON(!(irq->hw && irq->host_irq == virq)); - if (irq->hw) { - irq->hw = false; - ret = its_unmap_vlpi(virq); - } - -out: - mutex_unlock(&its->its_lock); - return ret; -} - -void kvm_vgic_v4_enable_doorbell(struct kvm_vcpu *vcpu) -{ - if (vgic_supports_direct_msis(vcpu->kvm)) { - int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq; - if (irq) - enable_irq(irq); - } -} - -void kvm_vgic_v4_disable_doorbell(struct kvm_vcpu *vcpu) -{ - if (vgic_supports_direct_msis(vcpu->kvm)) { - int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq; - if (irq) - disable_irq(irq); - } -} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c deleted file mode 100644 index 870b1185173b..000000000000 --- a/virt/kvm/arm/vgic/vgic.c +++ /dev/null @@ -1,973 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/list_sort.h> -#include <linux/nospec.h> - -#include <asm/kvm_hyp.h> - -#include "vgic.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -struct vgic_global kvm_vgic_global_state __ro_after_init = { - .gicv3_cpuif = STATIC_KEY_FALSE_INIT, -}; - -/* - * Locking order is always: - * kvm->lock (mutex) - * its->cmd_lock (mutex) - * its->its_lock (mutex) - * vgic_cpu->ap_list_lock must be taken with IRQs disabled - * kvm->lpi_list_lock must be taken with IRQs disabled - * vgic_irq->irq_lock must be taken with IRQs disabled - * - * As the ap_list_lock might be taken from the timer interrupt handler, - * we have to disable IRQs before taking this lock and everything lower - * than it. - * - * If you need to take multiple locks, always take the upper lock first, - * then the lower ones, e.g. first take the its_lock, then the irq_lock. - * If you are already holding a lock and need to take a higher one, you - * have to drop the lower ranking lock first and re-aquire it after having - * taken the upper one. - * - * When taking more than one ap_list_lock at the same time, always take the - * lowest numbered VCPU's ap_list_lock first, so: - * vcpuX->vcpu_id < vcpuY->vcpu_id: - * spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); - * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); - * - * Since the VGIC must support injecting virtual interrupts from ISRs, we have - * to use the spin_lock_irqsave/spin_unlock_irqrestore versions of outer - * spinlocks for any lock that may be taken while injecting an interrupt. 
- */ - -/* - * Iterate over the VM's list of mapped LPIs to find the one with a - * matching interrupt ID and return a reference to the IRQ structure. - */ -static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq = NULL; - unsigned long flags; - - spin_lock_irqsave(&dist->lpi_list_lock, flags); - - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - if (irq->intid != intid) - continue; - - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() later once it's finished with the IRQ. - */ - vgic_get_irq_kref(irq); - goto out_unlock; - } - irq = NULL; - -out_unlock: - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - return irq; -} - -/* - * This looks up the virtual interrupt ID to get the corresponding - * struct vgic_irq. It also increases the refcount, so any caller is expected - * to call vgic_put_irq() once it's finished with this IRQ. - */ -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid) -{ - /* SGIs and PPIs */ - if (intid <= VGIC_MAX_PRIVATE) { - intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); - return &vcpu->arch.vgic_cpu.private_irqs[intid]; - } - - /* SPIs */ - if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { - intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); - return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; - } - - /* LPIs */ - if (intid >= VGIC_MIN_LPI) - return vgic_get_lpi(kvm, intid); - - WARN(1, "Looking up struct vgic_irq for reserved INTID"); - return NULL; -} - -/* - * We can't do anything in here, because we lack the kvm pointer to - * lock and remove the item from the lpi_list. So we keep this function - * empty and use the return value of kref_put() to trigger the freeing. 
- */ -static void vgic_irq_release(struct kref *ref) -{ -} - -void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long flags; - - if (irq->intid < VGIC_MIN_LPI) - return; - - spin_lock_irqsave(&dist->lpi_list_lock, flags); - if (!kref_put(&irq->refcount, vgic_irq_release)) { - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - return; - }; - - list_del(&irq->lpi_list); - dist->lpi_list_count--; - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - kfree(irq); -} - -void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) -{ - WARN_ON(irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - pending)); -} - -bool vgic_get_phys_line_level(struct vgic_irq *irq) -{ - bool line_level; - - BUG_ON(!irq->hw); - - if (irq->get_input_level) - return irq->get_input_level(irq->intid); - - WARN_ON(irq_get_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - &line_level)); - return line_level; -} - -/* Set/Clear the physical active state */ -void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) -{ - - BUG_ON(!irq->hw); - WARN_ON(irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_ACTIVE, - active)); -} - -/** - * kvm_vgic_target_oracle - compute the target vcpu for an irq - * - * @irq: The irq to route. Must be already locked. - * - * Based on the current state of the interrupt (enabled, pending, - * active, vcpu and target_vcpu), compute the next vcpu this should be - * given to. Return NULL if this shouldn't be injected at all. - * - * Requires the IRQ lock to be held. - */ -static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) -{ - lockdep_assert_held(&irq->irq_lock); - - /* If the interrupt is active, it must stay on the current vcpu */ - if (irq->active) - return irq->vcpu ? : irq->target_vcpu; - - /* - * If the IRQ is not active but enabled and pending, we should direct - * it to its configured target VCPU. 
- * If the distributor is disabled, pending interrupts shouldn't be - * forwarded. - */ - if (irq->enabled && irq_is_pending(irq)) { - if (unlikely(irq->target_vcpu && - !irq->target_vcpu->kvm->arch.vgic.enabled)) - return NULL; - - return irq->target_vcpu; - } - - /* If neither active nor pending and enabled, then this IRQ should not - * be queued to any VCPU. - */ - return NULL; -} - -/* - * The order of items in the ap_lists defines how we'll pack things in LRs as - * well, the first items in the list being the first things populated in the - * LRs. - * - * A hard rule is that active interrupts can never be pushed out of the LRs - * (and therefore take priority) since we cannot reliably trap on deactivation - * of IRQs and therefore they have to be present in the LRs. - * - * Otherwise things should be sorted by the priority field and the GIC - * hardware support will take care of preemption of priority groups etc. - * - * Return negative if "a" sorts before "b", 0 to preserve order, and positive - * to sort "b" before "a". 
- */ -static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); - struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); - bool penda, pendb; - int ret; - - spin_lock(&irqa->irq_lock); - spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); - - if (irqa->active || irqb->active) { - ret = (int)irqb->active - (int)irqa->active; - goto out; - } - - penda = irqa->enabled && irq_is_pending(irqa); - pendb = irqb->enabled && irq_is_pending(irqb); - - if (!penda || !pendb) { - ret = (int)pendb - (int)penda; - goto out; - } - - /* Both pending and enabled, sort by priority */ - ret = irqa->priority - irqb->priority; -out: - spin_unlock(&irqb->irq_lock); - spin_unlock(&irqa->irq_lock); - return ret; -} - -/* Must be called with the ap_list_lock held */ -static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - lockdep_assert_held(&vgic_cpu->ap_list_lock); - - list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); -} - -/* - * Only valid injection if changing level for level-triggered IRQs or for a - * rising edge, and in-kernel connected IRQ lines can only be controlled by - * their owner. - */ -static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner) -{ - if (irq->owner != owner) - return false; - - switch (irq->config) { - case VGIC_CONFIG_LEVEL: - return irq->line_level != level; - case VGIC_CONFIG_EDGE: - return level; - } - - return false; -} - -/* - * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. - * Do the queuing if necessary, taking the right locks in the right order. - * Returns true when the IRQ was queued, false otherwise. - * - * Needs to be entered with the IRQ lock already held, but will return - * with all locks dropped. 
- */ -bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags) -{ - struct kvm_vcpu *vcpu; - - lockdep_assert_held(&irq->irq_lock); - -retry: - vcpu = vgic_target_oracle(irq); - if (irq->vcpu || !vcpu) { - /* - * If this IRQ is already on a VCPU's ap_list, then it - * cannot be moved or modified and there is no more work for - * us to do. - * - * Otherwise, if the irq is not pending and enabled, it does - * not need to be inserted into an ap_list and there is also - * no more work for us to do. - */ - spin_unlock_irqrestore(&irq->irq_lock, flags); - - /* - * We have to kick the VCPU here, because we could be - * queueing an edge-triggered interrupt for which we - * get no EOI maintenance interrupt. In that case, - * while the IRQ is already on the VCPU's AP list, the - * VCPU could have EOI'ed the original interrupt and - * won't see this one until it exits for some other - * reason. - */ - if (vcpu) { - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - } - return false; - } - - /* - * We must unlock the irq lock to take the ap_list_lock where - * we are going to insert this new pending interrupt. - */ - spin_unlock_irqrestore(&irq->irq_lock, flags); - - /* someone can do stuff here, which we re-check below */ - - spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - spin_lock(&irq->irq_lock); - - /* - * Did something change behind our backs? - * - * There are two cases: - * 1) The irq lost its pending state or was disabled behind our - * backs and/or it was queued to another VCPU's ap_list. - * 2) Someone changed the affinity on this irq behind our - * backs and we are now holding the wrong ap_list_lock. - * - * In both cases, drop the locks and retry. 
- */ - - if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) { - spin_unlock(&irq->irq_lock); - spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - - spin_lock_irqsave(&irq->irq_lock, flags); - goto retry; - } - - /* - * Grab a reference to the irq to reflect the fact that it is - * now in the ap_list. - */ - vgic_get_irq_kref(irq); - list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); - irq->vcpu = vcpu; - - spin_unlock(&irq->irq_lock); - spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - - return true; -} - -/** - * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic - * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs - * @intid: The INTID to inject a new state to. - * @level: Edge-triggered: true: to trigger the interrupt - * false: to ignore the call - * Level-sensitive true: raise the input signal - * false: lower the input signal - * @owner: The opaque pointer to the owner of the IRQ being raised to verify - * that the caller is allowed to inject this IRQ. Userspace - * injections will have owner == NULL. - * - * The VGIC is not concerned with devices being active-LOW or active-HIGH for - * level-sensitive interrupts. You can think of the level parameter as 1 - * being HIGH and 0 being LOW and all devices being active-HIGH. 
- */ -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level, void *owner) -{ - struct kvm_vcpu *vcpu; - struct vgic_irq *irq; - unsigned long flags; - int ret; - - trace_vgic_update_irq_pending(cpuid, intid, level); - - ret = vgic_lazy_init(kvm); - if (ret) - return ret; - - vcpu = kvm_get_vcpu(kvm, cpuid); - if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS) - return -EINVAL; - - irq = vgic_get_irq(kvm, vcpu, intid); - if (!irq) - return -EINVAL; - - spin_lock_irqsave(&irq->irq_lock, flags); - - if (!vgic_validate_injection(irq, level, owner)) { - /* Nothing to see here, move along... */ - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(kvm, irq); - return 0; - } - - if (irq->config == VGIC_CONFIG_LEVEL) - irq->line_level = level; - else - irq->pending_latch = true; - - vgic_queue_irq_unlock(kvm, irq, flags); - vgic_put_irq(kvm, irq); - - return 0; -} - -/* @irq->irq_lock must be held */ -static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - unsigned int host_irq, - bool (*get_input_level)(int vindid)) -{ - struct irq_desc *desc; - struct irq_data *data; - - /* - * Find the physical IRQ number corresponding to @host_irq - */ - desc = irq_to_desc(host_irq); - if (!desc) { - kvm_err("%s: no interrupt descriptor\n", __func__); - return -EINVAL; - } - data = irq_desc_get_irq_data(desc); - while (data->parent_data) - data = data->parent_data; - - irq->hw = true; - irq->host_irq = host_irq; - irq->hwintid = data->hwirq; - irq->get_input_level = get_input_level; - return 0; -} - -/* @irq->irq_lock must be held */ -static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq) -{ - irq->hw = false; - irq->hwintid = 0; - irq->get_input_level = NULL; -} - -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, - u32 vintid, bool (*get_input_level)(int vindid)) -{ - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); - unsigned long flags; - int ret; - - BUG_ON(!irq); - - 
spin_lock_irqsave(&irq->irq_lock, flags); - ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level); - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - - return ret; -} - -/** - * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ - * @vcpu: The VCPU pointer - * @vintid: The INTID of the interrupt - * - * Reset the active and pending states of a mapped interrupt. Kernel - * subsystems injecting mapped interrupts should reset their interrupt lines - * when we are doing a reset of the VM. - */ -void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid) -{ - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); - unsigned long flags; - - if (!irq->hw) - goto out; - - spin_lock_irqsave(&irq->irq_lock, flags); - irq->active = false; - irq->pending_latch = false; - irq->line_level = false; - spin_unlock_irqrestore(&irq->irq_lock, flags); -out: - vgic_put_irq(vcpu->kvm, irq); -} - -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) -{ - struct vgic_irq *irq; - unsigned long flags; - - if (!vgic_initialized(vcpu->kvm)) - return -EAGAIN; - - irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); - BUG_ON(!irq); - - spin_lock_irqsave(&irq->irq_lock, flags); - kvm_vgic_unmap_irq(irq); - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - - return 0; -} - -/** - * kvm_vgic_set_owner - Set the owner of an interrupt for a VM - * - * @vcpu: Pointer to the VCPU (used for PPIs) - * @intid: The virtual INTID identifying the interrupt (PPI or SPI) - * @owner: Opaque pointer to the owner - * - * Returns 0 if intid is not already used by another in-kernel device and the - * owner is set, otherwise returns an error code. 
- */ -int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner) -{ - struct vgic_irq *irq; - unsigned long flags; - int ret = 0; - - if (!vgic_initialized(vcpu->kvm)) - return -EAGAIN; - - /* SGIs and LPIs cannot be wired up to any device */ - if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid)) - return -EINVAL; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - spin_lock_irqsave(&irq->irq_lock, flags); - if (irq->owner && irq->owner != owner) - ret = -EEXIST; - else - irq->owner = owner; - spin_unlock_irqrestore(&irq->irq_lock, flags); - - return ret; -} - -/** - * vgic_prune_ap_list - Remove non-relevant interrupts from the list - * - * @vcpu: The VCPU pointer - * - * Go over the list of "interesting" interrupts, and prune those that we - * won't have to consider in the near future. - */ -static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq, *tmp; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - -retry: - spin_lock(&vgic_cpu->ap_list_lock); - - list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { - struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB; - bool target_vcpu_needs_kick = false; - - spin_lock(&irq->irq_lock); - - BUG_ON(vcpu != irq->vcpu); - - target_vcpu = vgic_target_oracle(irq); - - if (!target_vcpu) { - /* - * We don't need to process this interrupt any - * further, move it off the list. - */ - list_del(&irq->ap_list); - irq->vcpu = NULL; - spin_unlock(&irq->irq_lock); - - /* - * This vgic_put_irq call matches the - * vgic_get_irq_kref in vgic_queue_irq_unlock, - * where we added the LPI to the ap_list. As - * we remove the irq from the list, we drop - * also drop the refcount. - */ - vgic_put_irq(vcpu->kvm, irq); - continue; - } - - if (target_vcpu == vcpu) { - /* We're on the right CPU */ - spin_unlock(&irq->irq_lock); - continue; - } - - /* This interrupt looks like it has to be migrated. 
*/ - - spin_unlock(&irq->irq_lock); - spin_unlock(&vgic_cpu->ap_list_lock); - - /* - * Ensure locking order by always locking the smallest - * ID first. - */ - if (vcpu->vcpu_id < target_vcpu->vcpu_id) { - vcpuA = vcpu; - vcpuB = target_vcpu; - } else { - vcpuA = target_vcpu; - vcpuB = vcpu; - } - - spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); - spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, - SINGLE_DEPTH_NESTING); - spin_lock(&irq->irq_lock); - - /* - * If the affinity has been preserved, move the - * interrupt around. Otherwise, it means things have - * changed while the interrupt was unlocked, and we - * need to replay this. - * - * In all cases, we cannot trust the list not to have - * changed, so we restart from the beginning. - */ - if (target_vcpu == vgic_target_oracle(irq)) { - struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu; - - list_del(&irq->ap_list); - irq->vcpu = target_vcpu; - list_add_tail(&irq->ap_list, &new_cpu->ap_list_head); - target_vcpu_needs_kick = true; - } - - spin_unlock(&irq->irq_lock); - spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); - spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); - - if (target_vcpu_needs_kick) { - kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu); - kvm_vcpu_kick(target_vcpu); - } - - goto retry; - } - - spin_unlock(&vgic_cpu->ap_list_lock); -} - -static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_fold_lr_state(vcpu); - else - vgic_v3_fold_lr_state(vcpu); -} - -/* Requires the irq_lock to be held. 
*/ -static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, - struct vgic_irq *irq, int lr) -{ - lockdep_assert_held(&irq->irq_lock); - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_populate_lr(vcpu, irq, lr); - else - vgic_v3_populate_lr(vcpu, irq, lr); -} - -static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_clear_lr(vcpu, lr); - else - vgic_v3_clear_lr(vcpu, lr); -} - -static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_underflow(vcpu); - else - vgic_v3_set_underflow(vcpu); -} - -/* Requires the ap_list_lock to be held. */ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu, - bool *multi_sgi) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq; - int count = 0; - - *multi_sgi = false; - - lockdep_assert_held(&vgic_cpu->ap_list_lock); - - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - int w; - - spin_lock(&irq->irq_lock); - /* GICv2 SGIs can count for more than one... */ - w = vgic_irq_get_lr_count(irq); - spin_unlock(&irq->irq_lock); - - count += w; - *multi_sgi |= (w > 1); - } - return count; -} - -/* Requires the VCPU's ap_list_lock to be held. */ -static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq; - int count; - bool multi_sgi; - u8 prio = 0xff; - - lockdep_assert_held(&vgic_cpu->ap_list_lock); - - count = compute_ap_list_depth(vcpu, &multi_sgi); - if (count > kvm_vgic_global_state.nr_lr || multi_sgi) - vgic_sort_ap_list(vcpu); - - count = 0; - - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - spin_lock(&irq->irq_lock); - - /* - * If we have multi-SGIs in the pipeline, we need to - * guarantee that they are all seen before any IRQ of - * lower priority. In that case, we need to filter out - * these interrupts by exiting early. 
This is easy as - * the AP list has been sorted already. - */ - if (multi_sgi && irq->priority > prio) { - spin_unlock(&irq->irq_lock); - break; - } - - if (likely(vgic_target_oracle(irq) == vcpu)) { - vgic_populate_lr(vcpu, irq, count++); - - if (irq->source) - prio = irq->priority; - } - - spin_unlock(&irq->irq_lock); - - if (count == kvm_vgic_global_state.nr_lr) { - if (!list_is_last(&irq->ap_list, - &vgic_cpu->ap_list_head)) - vgic_set_underflow(vcpu); - break; - } - } - - vcpu->arch.vgic_cpu.used_lrs = count; - - /* Nuke remaining LRs */ - for ( ; count < kvm_vgic_global_state.nr_lr; count++) - vgic_clear_lr(vcpu, count); -} - -static inline bool can_access_vgic_from_kernel(void) -{ - /* - * GICv2 can always be accessed from the kernel because it is - * memory-mapped, and VHE systems can access GICv3 EL2 system - * registers. - */ - return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe(); -} - -static inline void vgic_save_state(struct kvm_vcpu *vcpu) -{ - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - vgic_v2_save_state(vcpu); - else - __vgic_v3_save_state(vcpu); -} - -/* Sync back the hardware VGIC state into our emulation after a guest's run. */ -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - WARN_ON(vgic_v4_sync_hwstate(vcpu)); - - /* An empty ap_list_head implies used_lrs == 0 */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) - return; - - if (can_access_vgic_from_kernel()) - vgic_save_state(vcpu); - - if (vgic_cpu->used_lrs) - vgic_fold_lr_state(vcpu); - vgic_prune_ap_list(vcpu); -} - -static inline void vgic_restore_state(struct kvm_vcpu *vcpu) -{ - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - vgic_v2_restore_state(vcpu); - else - __vgic_v3_restore_state(vcpu); -} - -/* Flush our emulation state into the GIC hardware before entering the guest. 
*/ -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) -{ - WARN_ON(vgic_v4_flush_hwstate(vcpu)); - - /* - * If there are no virtual interrupts active or pending for this - * VCPU, then there is no work to do and we can bail out without - * taking any lock. There is a potential race with someone injecting - * interrupts to the VCPU, but it is a benign race as the VCPU will - * either observe the new interrupt before or after doing this check, - * and introducing additional synchronization mechanism doesn't change - * this. - */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) - return; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - - spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); - vgic_flush_lr_state(vcpu); - spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); - - if (can_access_vgic_from_kernel()) - vgic_restore_state(vcpu); -} - -void kvm_vgic_load(struct kvm_vcpu *vcpu) -{ - if (unlikely(!vgic_initialized(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_load(vcpu); - else - vgic_v3_load(vcpu); -} - -void kvm_vgic_put(struct kvm_vcpu *vcpu) -{ - if (unlikely(!vgic_initialized(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_put(vcpu); - else - vgic_v3_put(vcpu); -} - -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq; - bool pending = false; - unsigned long flags; - struct vgic_vmcr vmcr; - - if (!vcpu->kvm->arch.vgic.enabled) - return false; - - if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) - return true; - - vgic_get_vmcr(vcpu, &vmcr); - - spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); - - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - spin_lock(&irq->irq_lock); - pending = irq_is_pending(irq) && irq->enabled && - !irq->active && - irq->priority < vmcr.pmr; - spin_unlock(&irq->irq_lock); - - if (pending) - break; - } - - spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); - - return 
pending; -} - -void vgic_kick_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int c; - - /* - * We've injected an interrupt, time to find out who deserves - * a good kick... - */ - kvm_for_each_vcpu(c, vcpu, kvm) { - if (kvm_vgic_vcpu_pending_irq(vcpu)) { - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); - } - } -} - -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) -{ - struct vgic_irq *irq; - bool map_is_active; - unsigned long flags; - - if (!vgic_initialized(vcpu->kvm)) - return false; - - irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); - spin_lock_irqsave(&irq->irq_lock, flags); - map_is_active = irq->hw && irq->active; - spin_unlock_irqrestore(&irq->irq_lock, flags); - vgic_put_irq(vcpu->kvm, irq); - - return map_is_active; -} - diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h deleted file mode 100644 index a90024718ca4..000000000000 --- a/virt/kvm/arm/vgic/vgic.h +++ /dev/null @@ -1,325 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ -#ifndef __KVM_ARM_VGIC_NEW_H__ -#define __KVM_ARM_VGIC_NEW_H__ - -#include <linux/irqchip/arm-gic-common.h> - -#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ -#define IMPLEMENTER_ARM 0x43b - -#define VGIC_ADDR_UNDEF (-1) -#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) - -#define INTERRUPT_ID_BITS_SPIS 10 -#define INTERRUPT_ID_BITS_ITS 16 -#define VGIC_PRI_BITS 5 - -#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) - -#define VGIC_AFFINITY_0_SHIFT 0 -#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT) -#define VGIC_AFFINITY_1_SHIFT 8 -#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT) -#define VGIC_AFFINITY_2_SHIFT 16 -#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT) -#define VGIC_AFFINITY_3_SHIFT 24 -#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT) - -#define VGIC_AFFINITY_LEVEL(reg, level) \ - ((((reg) & VGIC_AFFINITY_## level ##_MASK) \ - >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) - -/* - * The Userspace encodes the affinity differently from the MPIDR, - * Below macro converts vgic userspace format to MPIDR reg format. - */ -#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \ - VGIC_AFFINITY_LEVEL(val, 1) | \ - VGIC_AFFINITY_LEVEL(val, 2) | \ - VGIC_AFFINITY_LEVEL(val, 3)) - -/* - * As per Documentation/virtual/kvm/devices/arm-vgic-v3.txt, - * below macros are defined for CPUREG encoding. 
- */ -#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 -#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT 14 -#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK 0x0000000000003800 -#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT 11 -#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK 0x0000000000000780 -#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT 7 -#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK 0x0000000000000078 -#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT 3 -#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK 0x0000000000000007 -#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT 0 - -#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \ - KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \ - KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \ - KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \ - KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) - -/* - * As per Documentation/virtual/kvm/devices/arm-vgic-its.txt, - * below macros are defined for ITS table entry encoding. - */ -#define KVM_ITS_CTE_VALID_SHIFT 63 -#define KVM_ITS_CTE_VALID_MASK BIT_ULL(63) -#define KVM_ITS_CTE_RDBASE_SHIFT 16 -#define KVM_ITS_CTE_ICID_MASK GENMASK_ULL(15, 0) -#define KVM_ITS_ITE_NEXT_SHIFT 48 -#define KVM_ITS_ITE_PINTID_SHIFT 16 -#define KVM_ITS_ITE_PINTID_MASK GENMASK_ULL(47, 16) -#define KVM_ITS_ITE_ICID_MASK GENMASK_ULL(15, 0) -#define KVM_ITS_DTE_VALID_SHIFT 63 -#define KVM_ITS_DTE_VALID_MASK BIT_ULL(63) -#define KVM_ITS_DTE_NEXT_SHIFT 49 -#define KVM_ITS_DTE_NEXT_MASK GENMASK_ULL(62, 49) -#define KVM_ITS_DTE_ITTADDR_SHIFT 5 -#define KVM_ITS_DTE_ITTADDR_MASK GENMASK_ULL(48, 5) -#define KVM_ITS_DTE_SIZE_MASK GENMASK_ULL(4, 0) -#define KVM_ITS_L1E_VALID_MASK BIT_ULL(63) -/* we only support 64 kB translation table page size */ -#define KVM_ITS_L1E_ADDR_MASK GENMASK_ULL(51, 16) - -#define KVM_VGIC_V3_RDIST_INDEX_MASK GENMASK_ULL(11, 0) -#define KVM_VGIC_V3_RDIST_FLAGS_MASK GENMASK_ULL(15, 12) -#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT 12 -#define KVM_VGIC_V3_RDIST_BASE_MASK GENMASK_ULL(51, 16) -#define KVM_VGIC_V3_RDIST_COUNT_MASK GENMASK_ULL(63, 52) -#define 
KVM_VGIC_V3_RDIST_COUNT_SHIFT 52 - -#ifdef CONFIG_DEBUG_SPINLOCK -#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) -#else -#define DEBUG_SPINLOCK_BUG_ON(p) -#endif - -/* Requires the irq_lock to be held by the caller. */ -static inline bool irq_is_pending(struct vgic_irq *irq) -{ - if (irq->config == VGIC_CONFIG_EDGE) - return irq->pending_latch; - else - return irq->pending_latch || irq->line_level; -} - -static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq) -{ - return irq->config == VGIC_CONFIG_LEVEL && irq->hw; -} - -static inline int vgic_irq_get_lr_count(struct vgic_irq *irq) -{ - /* Account for the active state as an interrupt */ - if (vgic_irq_is_sgi(irq->intid) && irq->source) - return hweight8(irq->source) + irq->active; - - return irq_is_pending(irq) || irq->active; -} - -static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) -{ - return vgic_irq_get_lr_count(irq) > 1; -} - -/* - * This struct provides an intermediate representation of the fields contained - * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC - * state to userspace can generate either GICv2 or GICv3 CPU interface - * registers regardless of the hardware backed GIC used. 
- */ -struct vgic_vmcr { - u32 grpen0; - u32 grpen1; - - u32 ackctl; - u32 fiqen; - u32 cbpr; - u32 eoim; - - u32 abpr; - u32 bpr; - u32 pmr; /* Priority mask field in the GICC_PMR and - * ICC_PMR_EL1 priority field format */ -}; - -struct vgic_reg_attr { - struct kvm_vcpu *vcpu; - gpa_t addr; -}; - -int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr); -int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr); -const struct vgic_register_region * -vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, - gpa_t addr, int len); -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid); -void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); -bool vgic_get_phys_line_level(struct vgic_irq *irq); -void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); -void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); -bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags); -void vgic_kick_vcpus(struct kvm *kvm); - -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment); - -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); -void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); -void vgic_v2_set_npie(struct kvm_vcpu *vcpu); -int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); -int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v2_enable(struct kvm_vcpu *vcpu); -int vgic_v2_probe(const 
struct gic_kvm_info *info); -int vgic_v2_map_resources(struct kvm *kvm); -int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, - enum vgic_type); - -void vgic_v2_init_lrs(void); -void vgic_v2_load(struct kvm_vcpu *vcpu); -void vgic_v2_put(struct kvm_vcpu *vcpu); - -void vgic_v2_save_state(struct kvm_vcpu *vcpu); -void vgic_v2_restore_state(struct kvm_vcpu *vcpu); - -static inline void vgic_get_irq_kref(struct vgic_irq *irq) -{ - if (irq->intid < VGIC_MIN_LPI) - return; - - kref_get(&irq->refcount); -} - -void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); -void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); -void vgic_v3_set_npie(struct kvm_vcpu *vcpu); -void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_enable(struct kvm_vcpu *vcpu); -int vgic_v3_probe(const struct gic_kvm_info *info); -int vgic_v3_map_resources(struct kvm *kvm); -int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); -int vgic_v3_save_pending_tables(struct kvm *kvm); -int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count); -int vgic_register_redist_iodev(struct kvm_vcpu *vcpu); -bool vgic_v3_check_base(struct kvm *kvm); - -void vgic_v3_load(struct kvm_vcpu *vcpu); -void vgic_v3_put(struct kvm_vcpu *vcpu); - -bool vgic_has_its(struct kvm *kvm); -int kvm_vgic_register_its_device(void); -void vgic_enable_lpis(struct kvm_vcpu *vcpu); -int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); -int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); -int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, 
bool is_write, - u64 id, u64 *val); -int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg); -int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val); -int kvm_register_vgic_device(unsigned long type); -void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -int vgic_lazy_init(struct kvm *kvm); -int vgic_init(struct kvm *kvm); - -void vgic_debug_init(struct kvm *kvm); -void vgic_debug_destroy(struct kvm *kvm); - -bool lock_all_vcpus(struct kvm *kvm); -void unlock_all_vcpus(struct kvm *kvm); - -static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu; - - /* - * num_pri_bits are initialized with HW supported values. - * We can rely safely on num_pri_bits even if VM has not - * restored ICC_CTLR_EL1 before restoring APnR registers. - */ - switch (cpu_if->num_pri_bits) { - case 7: return 3; - case 6: return 1; - default: return 0; - } -} - -static inline bool -vgic_v3_redist_region_full(struct vgic_redist_region *region) -{ - if (!region->count) - return false; - - return (region->free_index >= region->count); -} - -struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs); - -static inline size_t -vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) -{ - if (!rdreg->count) - return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE; - else - return rdreg->count * KVM_VGIC_V3_REDIST_SIZE; -} - -struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, - u32 index); - -bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); - -static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) -{ - struct vgic_dist *d = &kvm->arch.vgic; - - return (base + size > d->vgic_dist_base) && - (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE); -} - -int vgic_copy_lpi_list(struct 
kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr); -int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid, struct vgic_irq **irq); -struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); - -bool vgic_supports_direct_msis(struct kvm *kvm); -int vgic_v4_init(struct kvm *kvm); -void vgic_v4_teardown(struct kvm *kvm); -int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu); -int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu); - -#endif diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 110cbe3f74f8..b8aaa96b799b 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kvm asynchronous fault support * @@ -5,19 +6,6 @@ * * Author: * Gleb Natapov <gleb@redhat.com> - * - * This file is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include <linux/kvm_host.h> @@ -29,21 +17,6 @@ #include "async_pf.h" #include <trace/events/kvm.h> -static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ -#ifdef CONFIG_KVM_ASYNC_PF_SYNC - kvm_arch_async_page_present(vcpu, work); -#endif -} -static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ -#ifndef CONFIG_KVM_ASYNC_PF_SYNC - kvm_arch_async_page_present(vcpu, work); -#endif -} - static struct kmem_cache *async_pf_cache; int kvm_async_pf_init(void) @@ -73,50 +46,79 @@ static void async_pf_execute(struct work_struct *work) { struct kvm_async_pf *apf = container_of(work, struct kvm_async_pf, work); - struct mm_struct *mm = apf->mm; struct kvm_vcpu *vcpu = apf->vcpu; + struct mm_struct *mm = vcpu->kvm->mm; unsigned long addr = apf->addr; - gva_t gva = apf->gva; + gpa_t cr2_or_gpa = apf->cr2_or_gpa; int locked = 1; + bool first; might_sleep(); /* - * This work is run asynchronously to the task which owns - * mm and might be done in another context, so we must - * access remotely. + * Attempt to pin the VM's host address space, and simply skip gup() if + * acquiring a pin fail, i.e. if the process is exiting. Note, KVM + * holds a reference to its associated mm_struct until the very end of + * kvm_destroy_vm(), i.e. the struct itself won't be freed before this + * work item is fully processed. */ - down_read(&mm->mmap_sem); - get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL, - &locked); - if (locked) - up_read(&mm->mmap_sem); + if (mmget_not_zero(mm)) { + mmap_read_lock(mm); + get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked); + if (locked) + mmap_read_unlock(mm); + mmput(mm); + } - kvm_async_page_present_sync(vcpu, apf); + /* + * Notify and kick the vCPU even if faulting in the page failed, e.g. + * so that the vCPU can retry the fault synchronously. 
+ */ + if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC)) + kvm_arch_async_page_present(vcpu, apf); spin_lock(&vcpu->async_pf.lock); + first = list_empty(&vcpu->async_pf.done); list_add_tail(&apf->link, &vcpu->async_pf.done); - apf->vcpu = NULL; spin_unlock(&vcpu->async_pf.lock); /* - * apf may be freed by kvm_check_async_pf_completion() after - * this point + * The apf struct may be freed by kvm_check_async_pf_completion() as + * soon as the lock is dropped. Nullify it to prevent improper usage. */ + apf = NULL; - trace_kvm_async_pf_completed(addr, gva); + if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first) + kvm_arch_async_page_present_queued(vcpu); - if (swq_has_sleeper(&vcpu->wq)) - swake_up_one(&vcpu->wq); + trace_kvm_async_pf_completed(addr, cr2_or_gpa); - mmput(mm); - kvm_put_kvm(vcpu->kvm); + __kvm_vcpu_wake_up(vcpu); } -void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) +static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work) { - spin_lock(&vcpu->async_pf.lock); + /* + * The async #PF is "done", but KVM must wait for the work item itself, + * i.e. async_pf_execute(), to run to completion. If KVM is a module, + * KVM must ensure *no* code owned by the KVM (the module) can be run + * after the last call to module_put(). Note, flushing the work item + * is always required when the item is taken off the completion queue. + * E.g. even if the vCPU handles the item in the "normal" path, the VM + * could be terminated before async_pf_execute() completes. + * + * Wake all events skip the queue and go straight done, i.e. don't + * need to be flushed (but sanity check that the work wasn't queued). 
+ */ + if (work->wakeup_all) + WARN_ON_ONCE(work->work.func); + else + flush_work(&work->work); + kmem_cache_free(async_pf_cache, work); +} +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) +{ /* cancel outstanding work queue item */ while (!list_empty(&vcpu->async_pf.queue)) { struct kvm_async_pf *work = @@ -124,32 +126,24 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) typeof(*work), queue); list_del(&work->queue); - /* - * We know it's present in vcpu->async_pf.done, do - * nothing here. - */ - if (!work->vcpu) - continue; - - spin_unlock(&vcpu->async_pf.lock); #ifdef CONFIG_KVM_ASYNC_PF_SYNC flush_work(&work->work); #else - if (cancel_work_sync(&work->work)) { - mmput(work->mm); - kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */ + if (cancel_work_sync(&work->work)) kmem_cache_free(async_pf_cache, work); - } #endif - spin_lock(&vcpu->async_pf.lock); } + spin_lock(&vcpu->async_pf.lock); while (!list_empty(&vcpu->async_pf.done)) { struct kvm_async_pf *work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link); list_del(&work->link); - kmem_cache_free(async_pf_cache, work); + + spin_unlock(&vcpu->async_pf.lock); + kvm_flush_and_free_async_pf_work(work); + spin_lock(&vcpu->async_pf.lock); } spin_unlock(&vcpu->async_pf.lock); @@ -161,7 +155,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) struct kvm_async_pf *work; while (!list_empty_careful(&vcpu->async_pf.done) && - kvm_arch_can_inject_async_page_present(vcpu)) { + kvm_arch_can_dequeue_async_page_present(vcpu)) { spin_lock(&vcpu->async_pf.lock); work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link); @@ -169,64 +163,60 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->async_pf.lock); kvm_arch_async_page_ready(vcpu, work); - kvm_async_page_present_async(vcpu, work); + if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC)) + kvm_arch_async_page_present(vcpu, work); list_del(&work->queue); vcpu->async_pf.queued--; - 
kmem_cache_free(async_pf_cache, work); + kvm_flush_and_free_async_pf_work(work); } } -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, - struct kvm_arch_async_pf *arch) +/* + * Try to schedule a job to handle page fault asynchronously. Returns 'true' on + * success, 'false' on failure (page fault has to be handled synchronously). + */ +bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch) { struct kvm_async_pf *work; if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) - return 0; + return false; - /* setup delayed work */ + /* Arch specific code should not do async PF in this case */ + if (unlikely(kvm_is_error_hva(hva))) + return false; /* * do alloc nowait since if we are going to sleep anyway we * may as well sleep faulting in page */ - work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN); + work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT); if (!work) - return 0; + return false; work->wakeup_all = false; work->vcpu = vcpu; - work->gva = gva; + work->cr2_or_gpa = cr2_or_gpa; work->addr = hva; work->arch = *arch; - work->mm = current->mm; - mmget(work->mm); - kvm_get_kvm(work->vcpu->kvm); - - /* this can't really happen otherwise gfn_to_pfn_async - would succeed */ - if (unlikely(kvm_is_error_hva(work->addr))) - goto retry_sync; INIT_WORK(&work->work, async_pf_execute); - if (!schedule_work(&work->work)) - goto retry_sync; list_add_tail(&work->queue, &vcpu->async_pf.queue); vcpu->async_pf.queued++; - kvm_arch_async_page_not_present(vcpu, work); - return 1; -retry_sync: - kvm_put_kvm(work->vcpu->kvm); - mmput(work->mm); - kmem_cache_free(async_pf_cache, work); - return 0; + work->notpresent_injected = kvm_arch_async_page_not_present(vcpu, work); + + schedule_work(&work->work); + + return true; } int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) { struct kvm_async_pf *work; + bool first; if (!list_empty_careful(&vcpu->async_pf.done)) return 0; @@ -239,9 +229,13 
@@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&work->queue); /* for list_del to work */ spin_lock(&vcpu->async_pf.lock); + first = list_empty(&vcpu->async_pf.done); list_add_tail(&work->link, &vcpu->async_pf.done); spin_unlock(&vcpu->async_pf.lock); + if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first) + kvm_arch_async_page_present_queued(vcpu); + vcpu->async_pf.queued++; return 0; } diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h index ec4cfa278f04..90d1a7d8c6de 100644 --- a/virt/kvm/async_pf.h +++ b/virt/kvm/async_pf.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * kvm asynchronous fault support * @@ -5,19 +6,6 @@ * * Author: * Gleb Natapov <gleb@redhat.com> - * - * This file is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __KVM_ASYNC_PF_H__ diff --git a/virt/kvm/binary_stats.c b/virt/kvm/binary_stats.c new file mode 100644 index 000000000000..eefca6c69f51 --- /dev/null +++ b/virt/kvm/binary_stats.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM binary statistics interface implementation + * + * Copyright 2021 Google LLC + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/errno.h> +#include <linux/uaccess.h> + +/** + * kvm_stats_read() - Common function to read from the binary statistics + * file descriptor. 
+ * + * @id: identification string of the stats + * @header: stats header for a vm or a vcpu + * @desc: start address of an array of stats descriptors for a vm or a vcpu + * @stats: start address of stats data block for a vm or a vcpu + * @size_stats: the size of stats data block pointed by @stats + * @user_buffer: start address of userspace buffer + * @size: requested read size from userspace + * @offset: the start position from which the content will be read for the + * corresponding vm or vcp file descriptor + * + * The file content of a vm/vcpu file descriptor is now defined as below: + * +-------------+ + * | Header | + * +-------------+ + * | id string | + * +-------------+ + * | Descriptors | + * +-------------+ + * | Stats Data | + * +-------------+ + * Although this function allows userspace to read any amount of data (as long + * as in the limit) from any position, the typical usage would follow below + * steps: + * 1. Read header from offset 0. Get the offset of descriptors and stats data + * and some other necessary information. This is a one-time work for the + * lifecycle of the corresponding vm/vcpu stats fd. + * 2. Read id string from its offset. This is a one-time work for the lifecycle + * of the corresponding vm/vcpu stats fd. + * 3. Read descriptors from its offset and discover all the stats by parsing + * descriptors. This is a one-time work for the lifecycle of the + * corresponding vm/vcpu stats fd. + * 4. Periodically read stats data from its offset using pread. 
+ * + * Return: the number of bytes that has been successfully read + */ +ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, + const struct _kvm_stats_desc *desc, + void *stats, size_t size_stats, + char __user *user_buffer, size_t size, loff_t *offset) +{ + ssize_t len; + ssize_t copylen; + ssize_t remain = size; + size_t size_desc; + size_t size_header; + void *src; + loff_t pos = *offset; + char __user *dest = user_buffer; + + size_header = sizeof(*header); + size_desc = header->num_desc * sizeof(*desc); + + len = KVM_STATS_NAME_SIZE + size_header + size_desc + size_stats - pos; + len = min(len, remain); + if (len <= 0) + return 0; + remain = len; + + /* + * Copy kvm stats header. + * The header is the first block of content userspace usually read out. + * The pos is 0 and the copylen and remain would be the size of header. + * The copy of the header would be skipped if offset is larger than the + * size of header. That usually happens when userspace reads stats + * descriptors and stats data. + */ + copylen = size_header - pos; + copylen = min(copylen, remain); + if (copylen > 0) { + src = (void *)header + pos; + if (copy_to_user(dest, src, copylen)) + return -EFAULT; + remain -= copylen; + pos += copylen; + dest += copylen; + } + + /* + * Copy kvm stats header id string. + * The id string is unique for every vm/vcpu, which is stored in kvm + * and kvm_vcpu structure. + * The id string is part of the stat header from the perspective of + * userspace, it is usually read out together with previous constant + * header part and could be skipped for later descriptors and stats + * data readings. + */ + copylen = header->id_offset + KVM_STATS_NAME_SIZE - pos; + copylen = min(copylen, remain); + if (copylen > 0) { + src = id + pos - header->id_offset; + if (copy_to_user(dest, src, copylen)) + return -EFAULT; + remain -= copylen; + pos += copylen; + dest += copylen; + } + + /* + * Copy kvm stats descriptors. 
+ * The descriptors copy would be skipped in the typical case that + * userspace periodically read stats data, since the pos would be + * greater than the end address of descriptors + * (header->header.desc_offset + size_desc) causing copylen <= 0. + */ + copylen = header->desc_offset + size_desc - pos; + copylen = min(copylen, remain); + if (copylen > 0) { + src = (void *)desc + pos - header->desc_offset; + if (copy_to_user(dest, src, copylen)) + return -EFAULT; + remain -= copylen; + pos += copylen; + dest += copylen; + } + + /* Copy kvm stats values */ + copylen = header->data_offset + size_stats - pos; + copylen = min(copylen, remain); + if (copylen > 0) { + src = stats + pos - header->data_offset; + if (copy_to_user(dest, src, copylen)) + return -EFAULT; + pos += copylen; + } + + *offset = pos; + return len; +} diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 6855cce3e528..375d6285475e 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -40,52 +40,40 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, return 1; } -static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev) -{ - struct kvm_coalesced_mmio_ring *ring; - unsigned avail; - - /* Are we able to batch it ? 
*/ - - /* last is the first free entry - * check if we don't meet the first used entry - * there is always one unused entry in the buffer - */ - ring = dev->kvm->coalesced_mmio_ring; - avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; - if (avail == 0) { - /* full */ - return 0; - } - - return 1; -} - static int coalesced_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val) { struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; + __u32 insert; if (!coalesced_mmio_in_range(dev, addr, len)) return -EOPNOTSUPP; spin_lock(&dev->kvm->ring_lock); - if (!coalesced_mmio_has_room(dev)) { + /* + * last is the index of the entry to fill. Verify userspace hasn't + * set last to be out of range, and that there is room in the ring. + * Leave one entry free in the ring so that userspace can differentiate + * between an empty ring and a full ring. + */ + insert = READ_ONCE(ring->last); + if (insert >= KVM_COALESCED_MMIO_MAX || + (insert + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first)) { spin_unlock(&dev->kvm->ring_lock); return -EOPNOTSUPP; } /* copy data in first free entry of the ring */ - ring->coalesced_mmio[ring->last].phys_addr = addr; - ring->coalesced_mmio[ring->last].len = len; - memcpy(ring->coalesced_mmio[ring->last].data, val, len); - ring->coalesced_mmio[ring->last].pio = dev->zone.pio; + ring->coalesced_mmio[insert].phys_addr = addr; + ring->coalesced_mmio[insert].len = len; + memcpy(ring->coalesced_mmio[insert].data, val, len); + ring->coalesced_mmio[insert].pio = dev->zone.pio; smp_wmb(); - ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; + ring->last = (insert + 1) % KVM_COALESCED_MMIO_MAX; spin_unlock(&dev->kvm->ring_lock); return 0; } @@ -107,26 +95,22 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = { int kvm_coalesced_mmio_init(struct kvm *kvm) { struct page *page; - int ret; - ret = -ENOMEM; - page = 
alloc_page(GFP_KERNEL | __GFP_ZERO); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page) - goto out_err; + return -ENOMEM; - ret = 0; kvm->coalesced_mmio_ring = page_address(page); /* * We're using this spinlock to sync access to the coalesced ring. - * The list doesn't need it's own lock since device registration and + * The list doesn't need its own lock since device registration and * unregistration should only happen when kvm->slots_lock is held. */ spin_lock_init(&kvm->ring_lock); INIT_LIST_HEAD(&kvm->coalesced_zones); -out_err: - return ret; + return 0; } void kvm_coalesced_mmio_free(struct kvm *kvm) @@ -144,7 +128,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, if (zone->pio != 1 && zone->pio != 0) return -EINVAL; - dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); + dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), + GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; @@ -174,21 +159,33 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone) { struct kvm_coalesced_mmio_dev *dev, *tmp; + int r; if (zone->pio != 1 && zone->pio != 0) return -EINVAL; mutex_lock(&kvm->slots_lock); - list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) + list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) { if (zone->pio == dev->zone.pio && coalesced_mmio_in_range(dev, zone->addr, zone->size)) { - kvm_io_bus_unregister_dev(kvm, + r = kvm_io_bus_unregister_dev(kvm, zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev); - kvm_iodevice_destructor(&dev->dev); + /* + * On failure, unregister destroys all devices on the + * bus, including the target device. There's no need + * to restart the walk as there aren't any zones left. + */ + if (r) + break; } + } mutex_unlock(&kvm->slots_lock); + /* + * Ignore the result of kvm_io_bus_unregister_dev(), from userspace's + * perspective, the coalesced MMIO is most definitely unregistered. 
+ */ return 0; } diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c new file mode 100644 index 000000000000..02bc6b00d76c --- /dev/null +++ b/virt/kvm/dirty_ring.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM dirty ring implementation + * + * Copyright 2019 Red Hat, Inc. + */ +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/vmalloc.h> +#include <linux/kvm_dirty_ring.h> +#include <trace/events/kvm.h> +#include "kvm_mm.h" + +int __weak kvm_cpu_dirty_log_size(struct kvm *kvm) +{ + return 0; +} + +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) +{ + return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(kvm); +} + +bool kvm_use_dirty_bitmap(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->slots_lock); + + return !kvm->dirty_ring_size || kvm->dirty_ring_with_bitmap; +} + +#ifndef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) +{ + return false; +} +#endif + +static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring) +{ + return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index); +} + +static bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) +{ + return kvm_dirty_ring_used(ring) >= ring->soft_limit; +} + +static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring) +{ + return kvm_dirty_ring_used(ring) >= ring->size; +} + +static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) +{ + struct kvm_memory_slot *memslot; + int as_id, id; + + as_id = slot >> 16; + id = (u16)slot; + + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) + return; + + memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); + + if (!memslot || (offset + __fls(mask)) >= memslot->npages) + return; + + KVM_MMU_LOCK(kvm); + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); + KVM_MMU_UNLOCK(kvm); +} + +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size) +{ + 
ring->dirty_gfns = vzalloc(size); + if (!ring->dirty_gfns) + return -ENOMEM; + + ring->size = size / sizeof(struct kvm_dirty_gfn); + ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(kvm); + ring->dirty_index = 0; + ring->reset_index = 0; + ring->index = index; + + return 0; +} + +static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn) +{ + smp_store_release(&gfn->flags, 0); +} + +static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) +{ + gfn->flags = KVM_DIRTY_GFN_F_DIRTY; +} + +static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn) +{ + return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET; +} + +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring, + int *nr_entries_reset) +{ + /* + * To minimize mmu_lock contention, batch resets for harvested entries + * whose gfns are in the same slot, and are within N frame numbers of + * each other, where N is the number of bits in an unsigned long. For + * simplicity, process the current set of entries when the next entry + * can't be included in the batch. + * + * Track the current batch slot, the gfn offset into the slot for the + * batch, and the bitmask of gfns that need to be reset (relative to + * offset). Note, the offset may be adjusted backwards, e.g. so that + * a sequence of gfns X, X-1, ... X-N-1 can be batched. + */ + u32 cur_slot, next_slot; + u64 cur_offset, next_offset; + unsigned long mask = 0; + struct kvm_dirty_gfn *entry; + + /* + * Ensure concurrent calls to KVM_RESET_DIRTY_RINGS are serialized, + * e.g. so that KVM fully resets all entries processed by a given call + * before returning to userspace. Holding slots_lock also protects + * the various memslot accesses. 
+ */ + lockdep_assert_held(&kvm->slots_lock); + + while (likely((*nr_entries_reset) < INT_MAX)) { + if (signal_pending(current)) + return -EINTR; + + entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)]; + + if (!kvm_dirty_gfn_harvested(entry)) + break; + + next_slot = READ_ONCE(entry->slot); + next_offset = READ_ONCE(entry->offset); + + /* Update the flags to reflect that this GFN is reset */ + kvm_dirty_gfn_set_invalid(entry); + + ring->reset_index++; + (*nr_entries_reset)++; + + if (mask) { + /* + * While the size of each ring is fixed, it's possible + * for the ring to be constantly re-dirtied/harvested + * while the reset is in-progress (the hard limit exists + * only to guard against the count becoming negative). + */ + cond_resched(); + + /* + * Try to coalesce the reset operations when the guest + * is scanning pages in the same slot. + */ + if (next_slot == cur_slot) { + s64 delta = next_offset - cur_offset; + + if (delta >= 0 && delta < BITS_PER_LONG) { + mask |= 1ull << delta; + continue; + } + + /* Backwards visit, careful about overflows! */ + if (delta > -BITS_PER_LONG && delta < 0 && + (mask << -delta >> -delta) == mask) { + cur_offset = next_offset; + mask = (mask << -delta) | 1; + continue; + } + } + + /* + * Reset the slot for all the harvested entries that + * have been gathered, but not yet fully processed. + */ + kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); + } + + /* + * The current slot was reset or this is the first harvested + * entry, (re)initialize the metadata. + */ + cur_slot = next_slot; + cur_offset = next_offset; + mask = 1; + } + + /* + * Perform a final reset if there are harvested entries that haven't + * been processed, which is guaranteed if at least one harvested was + * found. The loop only performs a reset when the "next" entry can't + * be batched with the "current" entry(s), and that reset processes the + * _current_ entry(s); i.e. the last harvested entry, a.k.a. next, will + * always be left pending. 
+ */ + if (mask) + kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); + + /* + * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared + * by the VCPU thread next time when it enters the guest. + */ + + trace_kvm_dirty_ring_reset(ring); + + return 0; +} + +void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset) +{ + struct kvm_dirty_ring *ring = &vcpu->dirty_ring; + struct kvm_dirty_gfn *entry; + + /* It should never get full */ + WARN_ON_ONCE(kvm_dirty_ring_full(ring)); + + entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)]; + + entry->slot = slot; + entry->offset = offset; + /* + * Make sure the data is filled in before we publish this to + * the userspace program. There's no paired kernel-side reader. + */ + smp_wmb(); + kvm_dirty_gfn_set_dirtied(entry); + ring->dirty_index++; + trace_kvm_dirty_ring_push(ring, slot, offset); + + if (kvm_dirty_ring_soft_full(ring)) + kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu); +} + +bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu) +{ + /* + * The VCPU isn't runnable when the dirty ring becomes soft full. + * The KVM_REQ_DIRTY_RING_SOFT_FULL event is always set to prevent + * the VCPU from running until the dirty pages are harvested and + * the dirty ring is reset by userspace. 
+ */ + if (kvm_check_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu) && + kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) { + kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu); + vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; + trace_kvm_dirty_ring_exit(vcpu); + return true; + } + + return false; +} + +struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset) +{ + return vmalloc_to_page((void *)ring->dirty_gfns + offset * PAGE_SIZE); +} + +void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) +{ + vfree(ring->dirty_gfns); + ring->dirty_gfns = NULL; +} diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index b20b751286fc..0e8b5277be3b 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kvm eventfd support - use eventfd objects to signal various KVM events * @@ -6,19 +7,6 @@ * * Author: * Gregory Haskins <ghaskins@novell.com> - * - * This file is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include <linux/kvm_host.h> @@ -40,10 +28,16 @@ #include <kvm/iodev.h> -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP static struct workqueue_struct *irqfd_cleanup_wq; +bool __attribute__((weak)) +kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) +{ + return true; +} + static void irqfd_inject(struct work_struct *work) { @@ -61,6 +55,15 @@ irqfd_inject(struct work_struct *work) irqfd->gsi, 1, false); } +static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler) +{ + struct kvm_kernel_irqfd *irqfd; + + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link, + srcu_read_lock_held(&resampler->kvm->irq_srcu)) + eventfd_signal(irqfd->resamplefd); +} + /* * Since resampler irqfds share an IRQ source ID, we de-assert once * then notify all of the resampler irqfds using this GSI. We can't @@ -71,7 +74,6 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) { struct kvm_kernel_irqfd_resampler *resampler; struct kvm *kvm; - struct kvm_kernel_irqfd *irqfd; int idx; resampler = container_of(kian, @@ -82,10 +84,7 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) resampler->notifier.gsi, 0, false); idx = srcu_read_lock(&kvm->irq_srcu); - - list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) - eventfd_signal(irqfd->resamplefd, 1); - + irqfd_resampler_notify(resampler); srcu_read_unlock(&kvm->irq_srcu, idx); } @@ -98,14 +97,19 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd) mutex_lock(&kvm->irqfds.resampler_lock); list_del_rcu(&irqfd->resampler_link); - synchronize_srcu(&kvm->irq_srcu); if (list_empty(&resampler->list)) { - list_del(&resampler->link); + list_del_rcu(&resampler->link); kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); + /* + * synchronize_srcu_expedited(&kvm->irq_srcu) already called + * in kvm_unregister_irq_ack_notifier(). 
+ */ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, resampler->notifier.gsi, 0, false); kfree(resampler); + } else { + synchronize_srcu_expedited(&kvm->irq_srcu); } mutex_unlock(&kvm->irqfds.resampler_lock); @@ -122,8 +126,8 @@ irqfd_shutdown(struct work_struct *work) struct kvm *kvm = irqfd->kvm; u64 cnt; - /* Make sure irqfd has been initalized in assign path. */ - synchronize_srcu(&kvm->irq_srcu); + /* Make sure irqfd has been initialized in assign path. */ + synchronize_srcu_expedited(&kvm->irq_srcu); /* * Synchronize with the wait-queue and unhook ourselves to prevent @@ -145,7 +149,7 @@ irqfd_shutdown(struct work_struct *work) /* * It is now safe to release the object's resources */ -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) irq_bypass_unregister_consumer(&irqfd->consumer); #endif eventfd_ctx_put(irqfd->eventfd); @@ -197,8 +201,17 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) struct kvm *kvm = irqfd->kvm; unsigned seq; int idx; + int ret = 0; if (flags & EPOLLIN) { + /* + * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP, + * as KVM holds irqfds.lock when registering the irqfd with the + * eventfd. + */ + u64 cnt; + eventfd_ctx_do_read(irqfd->eventfd, &cnt); + idx = srcu_read_lock(&kvm->irq_srcu); do { seq = read_seqcount_begin(&irqfd->irq_entry_sc); @@ -210,13 +223,19 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) false) == -EWOULDBLOCK) schedule_work(&irqfd->inject); srcu_read_unlock(&kvm->irq_srcu, idx); + ret = 1; } if (flags & EPOLLHUP) { /* The eventfd is closing, detach from KVM */ - unsigned long flags; + unsigned long iflags; - spin_lock_irqsave(&kvm->irqfds.lock, flags); + /* + * Taking irqfds.lock is safe here, as KVM holds a reference to + * the eventfd when registering the irqfd, i.e. this path can't + * be reached while kvm_irqfd_add() is running. 
+ */ + spin_lock_irqsave(&kvm->irqfds.lock, iflags); /* * We must check if someone deactivated the irqfd before @@ -230,28 +249,20 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) if (irqfd_is_active(irqfd)) irqfd_deactivate(irqfd); - spin_unlock_irqrestore(&kvm->irqfds.lock, flags); + spin_unlock_irqrestore(&kvm->irqfds.lock, iflags); } - return 0; -} - -static void -irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, - poll_table *pt) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(pt, struct kvm_kernel_irqfd, pt); - add_wait_queue(wqh, &irqfd->wait); + return ret; } -/* Must be called under irqfds.lock */ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) { struct kvm_kernel_irq_routing_entry *e; struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; int n_entries; + lockdep_assert_held(&kvm->irqfds.lock); + n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); write_seqcount_begin(&irqfd->irq_entry_sc); @@ -265,7 +276,64 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) write_seqcount_end(&irqfd->irq_entry_sc); } -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +struct kvm_irqfd_pt { + struct kvm_kernel_irqfd *irqfd; + struct kvm *kvm; + poll_table pt; + int ret; +}; + +static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); + struct kvm_kernel_irqfd *irqfd = p->irqfd; + struct kvm *kvm = p->kvm; + + /* + * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing, + * and irqfds.items. It does NOT protect registering with the eventfd. + */ + spin_lock_irq(&kvm->irqfds.lock); + + /* + * Initialize the routing information prior to adding the irqfd to the + * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the + * irqfd is registered. 
+ */ + irqfd_update(kvm, irqfd); + + /* + * Add the irqfd as a priority waiter on the eventfd, with a custom + * wake-up handler, so that KVM *and only KVM* is notified whenever the + * underlying eventfd is signaled. + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + + /* + * Temporarily lie to lockdep about holding irqfds.lock to avoid a + * false positive regarding potential deadlock with irqfd_wakeup() + * (see irqfd_wakeup() for details). + * + * Adding to the wait queue will fail if there is already a priority + * waiter, i.e. if the eventfd is associated with another irqfd (in any + * VM). Note, kvm_irqfd_deassign() waits for all in-flight shutdown + * jobs to complete, i.e. ensures the irqfd has been removed from the + * eventfd's waitqueue before returning to userspace. + */ + spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); + p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait); + spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); + if (p->ret) + goto out; + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + +out: + spin_unlock_irq(&kvm->irqfds.lock); +} + +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) void __attribute__((weak)) kvm_arch_irq_bypass_stop( struct irq_bypass_consumer *cons) { @@ -276,20 +344,20 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start( { } -int __attribute__((weak)) kvm_arch_update_irqfd_routing( - struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { - return 0; + } #endif static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { - struct kvm_kernel_irqfd *irqfd, *tmp; - struct fd f; + struct kvm_kernel_irqfd *irqfd; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; + struct kvm_irqfd_pt irqfd_pt; int ret; __poll_t events; int idx; @@ -297,7 +365,10 @@ kvm_irqfd_assign(struct kvm *kvm, struct 
kvm_irqfd *args) if (!kvm_arch_intc_initialized(kvm)) return -EAGAIN; - irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!kvm_arch_irqfd_allowed(kvm, args)) + return -EINVAL; + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT); if (!irqfd) return -ENOMEM; @@ -306,18 +377,18 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) INIT_LIST_HEAD(&irqfd->list); INIT_WORK(&irqfd->inject, irqfd_inject); INIT_WORK(&irqfd->shutdown, irqfd_shutdown); - seqcount_init(&irqfd->irq_entry_sc); + seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock); - f = fdget(args->fd); - if (!f.file) { + CLASS(fd, f)(args->fd); + if (fd_empty(f)) { ret = -EBADF; goto out; } - eventfd = eventfd_ctx_fileget(f.file); + eventfd = eventfd_ctx_fileget(fd_file(f)); if (IS_ERR(eventfd)) { ret = PTR_ERR(eventfd); - goto fail; + goto out; } irqfd->eventfd = eventfd; @@ -345,7 +416,8 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) } if (!irqfd->resampler) { - resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); + resampler = kzalloc(sizeof(*resampler), + GFP_KERNEL_ACCOUNT); if (!resampler) { ret = -ENOMEM; mutex_unlock(&kvm->irqfds.resampler_lock); @@ -358,76 +430,67 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) resampler->notifier.irq_acked = irqfd_resampler_ack; INIT_LIST_HEAD(&resampler->link); - list_add(&resampler->link, &kvm->irqfds.resampler_list); + list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list); kvm_register_irq_ack_notifier(kvm, &resampler->notifier); irqfd->resampler = resampler; } list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); mutex_unlock(&kvm->irqfds.resampler_lock); } /* - * Install our own custom wake-up handling so we are notified via - * a callback whenever someone signals the underlying eventfd + * Set the irqfd routing and add it to KVM's list before registering + * the irqfd with the eventfd, so that the routing information 
is valid + * and stays valid, e.g. if there are GSI routing changes, prior to + * making the irqfd visible, i.e. before it might be signaled. + * + * Note, holding SRCU ensures a stable read of routing information, and + * also prevents irqfd_shutdown() from freeing the irqfd before it's + * fully initialized. */ - init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); - init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); - - spin_lock_irq(&kvm->irqfds.lock); - - ret = 0; - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - /* This fd is used for another irq already. */ - ret = -EBUSY; - spin_unlock_irq(&kvm->irqfds.lock); - goto fail; - } - idx = srcu_read_lock(&kvm->irq_srcu); - irqfd_update(kvm, irqfd); - - list_add_tail(&irqfd->list, &kvm->irqfds.items); - - spin_unlock_irq(&kvm->irqfds.lock); /* - * Check if there was an event already pending on the eventfd - * before we registered, and trigger it as if we didn't miss it. + * Register the irqfd with the eventfd by polling on the eventfd, and + * simultaneously add the irqfd to KVM's list. If there was an event + * pending on the eventfd prior to registering, manually trigger IRQ + * injection. 
*/ - events = vfs_poll(f.file, &irqfd->pt); + irqfd_pt.irqfd = irqfd; + irqfd_pt.kvm = kvm; + init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); + + events = vfs_poll(fd_file(f), &irqfd_pt.pt); + + ret = irqfd_pt.ret; + if (ret) + goto fail_poll; if (events & EPOLLIN) schedule_work(&irqfd->inject); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (kvm_arch_has_irq_bypass()) { - irqfd->consumer.token = (void *)irqfd->eventfd; irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; irqfd->consumer.stop = kvm_arch_irq_bypass_stop; irqfd->consumer.start = kvm_arch_irq_bypass_start; - ret = irq_bypass_register_consumer(&irqfd->consumer); + ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd); if (ret) - pr_info("irq bypass consumer (token %p) registration fails: %d\n", - irqfd->consumer.token, ret); + pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n", + irqfd->eventfd, ret); } #endif srcu_read_unlock(&kvm->irq_srcu, idx); - - /* - * do not drop the file until the irqfd is fully initialized, otherwise - * we might race against the EPOLLHUP - */ - fdput(f); return 0; +fail_poll: + srcu_read_unlock(&kvm->irq_srcu, idx); fail: if (irqfd->resampler) irqfd_resampler_shutdown(irqfd); @@ -438,8 +501,6 @@ fail: if (eventfd && !IS_ERR(eventfd)) eventfd_ctx_put(eventfd); - fdput(f); - out: kfree(irqfd); return ret; @@ -453,8 +514,8 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) idx = srcu_read_lock(&kvm->irq_srcu); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) + hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, + link, srcu_read_lock_held(&kvm->irq_srcu)) if (kian->gsi == gsi) { srcu_read_unlock(&kvm->irq_srcu, idx); return true; @@ -464,14 +525,14 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, 
unsigned pin) return false; } -EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_irq_has_notifier); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi) { struct kvm_irq_ack_notifier *kian; - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) + hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, + link, srcu_read_lock_held(&kvm->irq_srcu)) if (kian->gsi == gsi) kian->irq_acked(kian); } @@ -504,24 +565,10 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); kvm_arch_post_irq_ack_notifier_list_update(kvm); } -#endif - -void -kvm_eventfd_init(struct kvm *kvm) -{ -#ifdef CONFIG_HAVE_KVM_IRQFD - spin_lock_init(&kvm->irqfds.lock); - INIT_LIST_HEAD(&kvm->irqfds.items); - INIT_LIST_HEAD(&kvm->irqfds.resampler_list); - mutex_init(&kvm->irqfds.resampler_lock); -#endif - INIT_LIST_HEAD(&kvm->ioeventfds); -} -#ifdef CONFIG_HAVE_KVM_IRQFD /* * shutdown any irqfd's that match fd+gsi */ @@ -603,7 +650,7 @@ kvm_irqfd_release(struct kvm *kvm) /* * Take note of a change in irq routing. - * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. + * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards. 
*/ void kvm_irq_routing_update(struct kvm *kvm) { @@ -612,21 +659,47 @@ void kvm_irq_routing_update(struct kvm *kvm) spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry(irqfd, &kvm->irqfds.items, list) { +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) + /* Under irqfds.lock, so can read irq_entry safely */ + struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry; +#endif + irqfd_update(kvm, irqfd); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS - if (irqfd->producer) { - int ret = kvm_arch_update_irqfd_routing( - irqfd->kvm, irqfd->producer->irq, - irqfd->gsi, 1); - WARN_ON(ret); - } +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) + if (irqfd->producer) + kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); #endif } spin_unlock_irq(&kvm->irqfds.lock); } +bool kvm_notify_irqfd_resampler(struct kvm *kvm, + unsigned int irqchip, + unsigned int pin) +{ + struct kvm_kernel_irqfd_resampler *resampler; + int gsi, idx; + + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); + if (gsi != -1) { + list_for_each_entry_srcu(resampler, + &kvm->irqfds.resampler_list, link, + srcu_read_lock_held(&kvm->irq_srcu)) { + if (resampler->notifier.gsi == gsi) { + irqfd_resampler_notify(resampler); + srcu_read_unlock(&kvm->irq_srcu, idx); + return true; + } + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); + + return false; +} + /* * create a host-wide workqueue for issuing deferred shutdown requests * aggregated from all vm* instances. We need our own isolated @@ -634,7 +707,7 @@ void kvm_irq_routing_update(struct kvm *kvm) */ int kvm_irqfd_init(void) { - irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0); + irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", WQ_PERCPU, 0); if (!irqfd_cleanup_wq) return -ENOMEM; @@ -723,7 +796,7 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) return false; } - return _val == p->datamatch ? 
true : false; + return _val == p->datamatch; } /* MMIO/PIO writes trigger an event if the addr/val match */ @@ -736,7 +809,7 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, if (!ioeventfd_in_range(p, addr, len, val)) return -EOPNOTSUPP; - eventfd_signal(p->eventfd, 1); + eventfd_signal(p->eventfd); return 0; } @@ -797,7 +870,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm, if (IS_ERR(eventfd)) return PTR_ERR(eventfd); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT); if (!p) { ret = -ENOMEM; goto fail; @@ -839,9 +912,9 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm, unlock_fail: mutex_unlock(&kvm->slots_lock); + kfree(p); fail: - kfree(p); eventfd_ctx_put(eventfd); return ret; @@ -851,20 +924,21 @@ static int kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_ioeventfd *args) { - struct _ioeventfd *p, *tmp; + struct _ioeventfd *p; struct eventfd_ctx *eventfd; struct kvm_io_bus *bus; int ret = -ENOENT; + bool wildcard; eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); - mutex_lock(&kvm->slots_lock); + wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); - list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { - bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); + mutex_lock(&kvm->slots_lock); + list_for_each_entry(p, &kvm->ioeventfds, list) { if (p->bus_idx != bus_idx || p->eventfd != eventfd || p->addr != args->addr || @@ -879,7 +953,6 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, bus = kvm_get_bus(kvm, bus_idx); if (bus) bus->ioeventfd_count--; - ioeventfd_release(p); ret = 0; break; } @@ -962,3 +1035,15 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) return kvm_assign_ioeventfd(kvm, args); } + +void +kvm_eventfd_init(struct kvm *kvm) +{ +#ifdef CONFIG_HAVE_KVM_IRQCHIP + spin_lock_init(&kvm->irqfds.lock); + INIT_LIST_HEAD(&kvm->irqfds.items); + 
INIT_LIST_HEAD(&kvm->irqfds.resampler_list); + mutex_init(&kvm->irqfds.resampler_lock); +#endif + INIT_LIST_HEAD(&kvm->ioeventfds); +} diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c new file mode 100644 index 000000000000..fdaea3422c30 --- /dev/null +++ b/virt/kvm/guest_memfd.c @@ -0,0 +1,1016 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/anon_inodes.h> +#include <linux/backing-dev.h> +#include <linux/falloc.h> +#include <linux/fs.h> +#include <linux/kvm_host.h> +#include <linux/mempolicy.h> +#include <linux/pseudo_fs.h> +#include <linux/pagemap.h> + +#include "kvm_mm.h" + +static struct vfsmount *kvm_gmem_mnt; + +/* + * A guest_memfd instance can be associated with multiple VMs, each with its own + * "view" of the underlying physical memory. + * + * The gmem's inode is effectively the raw underlying physical storage, and is + * used to track properties of the physical memory, while each gmem file is + * effectively a single VM's view of that storage, and is used to track assets + * specific to its associated VM, e.g. memslots=>gmem bindings. + */ +struct gmem_file { + struct kvm *kvm; + struct xarray bindings; + struct list_head entry; +}; + +struct gmem_inode { + struct shared_policy policy; + struct inode vfs_inode; + + u64 flags; +}; + +static __always_inline struct gmem_inode *GMEM_I(struct inode *inode) +{ + return container_of(inode, struct gmem_inode, vfs_inode); +} + +#define kvm_gmem_for_each_file(f, mapping) \ + list_for_each_entry(f, &(mapping)->i_private_list, entry) + +/** + * folio_file_pfn - like folio_file_page, but return a pfn. + * @folio: The folio which contains this index. + * @index: The index we want to look up. + * + * Return: The pfn for this index. 
+ */ +static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) +{ + return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); +} + +static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return gfn - slot->base_gfn + slot->gmem.pgoff; +} + +static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, + pgoff_t index, struct folio *folio) +{ +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE + kvm_pfn_t pfn = folio_file_pfn(folio, index); + gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff; + int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); + if (rc) { + pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", + index, gfn, pfn, rc); + return rc; + } +#endif + + return 0; +} + +static inline void kvm_gmem_mark_prepared(struct folio *folio) +{ + folio_mark_uptodate(folio); +} + +/* + * Process @folio, which contains @gfn, so that the guest can use it. + * The folio must be locked and the gfn must be contained in @slot. + * On successful return the guest sees a zero page so as to avoid + * leaking host data and the up-to-date flag is set. + */ +static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, struct folio *folio) +{ + unsigned long nr_pages, i; + pgoff_t index; + int r; + + nr_pages = folio_nr_pages(folio); + for (i = 0; i < nr_pages; i++) + clear_highpage(folio_page(folio, i)); + + /* + * Preparing huge folios should always be safe, since it should + * be possible to split them later if needed. + * + * Right now the folio order is always going to be zero, but the + * code is ready for huge folios. The only assumption is that + * the base pgoff of memslots is naturally aligned with the + * requested page order, ensuring that huge folios can also use + * huge page table entries for GPA->HPA mapping. + * + * The order will be passed when creating the guest_memfd, and + * checked when creating memslots. 
+ */ + WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio))); + index = kvm_gmem_get_index(slot, gfn); + index = ALIGN_DOWN(index, folio_nr_pages(folio)); + r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); + if (!r) + kvm_gmem_mark_prepared(folio); + + return r; +} + +/* + * Returns a locked folio on success. The caller is responsible for + * setting the up-to-date flag before the memory is mapped into the guest. + * There is no backing storage for the memory, so the folio will remain + * up-to-date until it's removed. + * + * Ignore accessed, referenced, and dirty flags. The memory is + * unevictable and there is no storage to write back to. + */ +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +{ + /* TODO: Support huge pages. */ + struct mempolicy *policy; + struct folio *folio; + + /* + * Fast-path: See if folio is already present in mapping to avoid + * policy_lookup. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED, 0); + if (!IS_ERR(folio)) + return folio; + + policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index); + folio = __filemap_get_folio_mpol(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping), policy); + mpol_cond_put(policy); + + return folio; +} + +static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) +{ + if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED) + return KVM_FILTER_SHARED; + + return KVM_FILTER_PRIVATE; +} + +static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start, + pgoff_t end, + enum kvm_gfn_range_filter attr_filter) +{ + bool flush = false, found_memslot = false; + struct kvm_memory_slot *slot; + struct kvm *kvm = f->kvm; + unsigned long index; + + xa_for_each_range(&f->bindings, index, slot, start, end - 1) { + pgoff_t pgoff = slot->gmem.pgoff; + + struct kvm_gfn_range gfn_range = { + .start = slot->base_gfn + max(pgoff, start) - pgoff, + 
.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff, + .slot = slot, + .may_block = true, + .attr_filter = attr_filter, + }; + + if (!found_memslot) { + found_memslot = true; + + KVM_MMU_LOCK(kvm); + kvm_mmu_invalidate_begin(kvm); + } + + flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range); + } + + if (flush) + kvm_flush_remote_tlbs(kvm); + + if (found_memslot) + KVM_MMU_UNLOCK(kvm); +} + +static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, + pgoff_t end) +{ + enum kvm_gfn_range_filter attr_filter; + struct gmem_file *f; + + attr_filter = kvm_gmem_get_invalidate_filter(inode); + + kvm_gmem_for_each_file(f, inode->i_mapping) + __kvm_gmem_invalidate_begin(f, start, end, attr_filter); +} + +static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, + pgoff_t end) +{ + struct kvm *kvm = f->kvm; + + if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { + KVM_MMU_LOCK(kvm); + kvm_mmu_invalidate_end(kvm); + KVM_MMU_UNLOCK(kvm); + } +} + +static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, + pgoff_t end) +{ + struct gmem_file *f; + + kvm_gmem_for_each_file(f, inode->i_mapping) + __kvm_gmem_invalidate_end(f, start, end); +} + +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; + + /* + * Bindings must be stable across invalidation to ensure the start+end + * are balanced. + */ + filemap_invalidate_lock(inode->i_mapping); + + kvm_gmem_invalidate_begin(inode, start, end); + + truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); + + kvm_gmem_invalidate_end(inode, start, end); + + filemap_invalidate_unlock(inode->i_mapping); + + return 0; +} + +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t start, index, end; + int r; + + /* Dedicated guest is immutable by default. 
*/ + if (offset + len > i_size_read(inode)) + return -EINVAL; + + filemap_invalidate_lock_shared(mapping); + + start = offset >> PAGE_SHIFT; + end = (offset + len) >> PAGE_SHIFT; + + r = 0; + for (index = start; index < end; ) { + struct folio *folio; + + if (signal_pending(current)) { + r = -EINTR; + break; + } + + folio = kvm_gmem_get_folio(inode, index); + if (IS_ERR(folio)) { + r = PTR_ERR(folio); + break; + } + + index = folio_next_index(folio); + + folio_unlock(folio); + folio_put(folio); + + /* 64-bit only, wrapping the index should be impossible. */ + if (WARN_ON_ONCE(!index)) + break; + + cond_resched(); + } + + filemap_invalidate_unlock_shared(mapping); + + return r; +} + +static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + int ret; + + if (!(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = kvm_gmem_punch_hole(file_inode(file), offset, len); + else + ret = kvm_gmem_allocate(file_inode(file), offset, len); + + if (!ret) + file_modified(file); + return ret; +} + +static int kvm_gmem_release(struct inode *inode, struct file *file) +{ + struct gmem_file *f = file->private_data; + struct kvm_memory_slot *slot; + struct kvm *kvm = f->kvm; + unsigned long index; + + /* + * Prevent concurrent attempts to *unbind* a memslot. This is the last + * reference to the file and thus no new bindings can be created, but + * dereferencing the slot for existing bindings needs to be protected + * against memslot updates, specifically so that unbind doesn't race + * and free the memslot (kvm_gmem_get_file() will return NULL). + * + * Since .release is called only when the reference count is zero, + * after which file_ref_get() and get_file_active() fail, + * kvm_gmem_get_pfn() cannot be using the file concurrently. 
+ * file_ref_put() provides a full barrier, and get_file_active() the + * matching acquire barrier. + */ + mutex_lock(&kvm->slots_lock); + + filemap_invalidate_lock(inode->i_mapping); + + xa_for_each(&f->bindings, index, slot) + WRITE_ONCE(slot->gmem.file, NULL); + + /* + * All in-flight operations are gone and new bindings can be created. + * Zap all SPTEs pointed at by this file. Do not free the backing + * memory, as its lifetime is associated with the inode, not the file. + */ + __kvm_gmem_invalidate_begin(f, 0, -1ul, + kvm_gmem_get_invalidate_filter(inode)); + __kvm_gmem_invalidate_end(f, 0, -1ul); + + list_del(&f->entry); + + filemap_invalidate_unlock(inode->i_mapping); + + mutex_unlock(&kvm->slots_lock); + + xa_destroy(&f->bindings); + kfree(f); + + kvm_put_kvm(kvm); + + return 0; +} + +static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) +{ + /* + * Do not return slot->gmem.file if it has already been closed; + * there might be some time between the last fput() and when + * kvm_gmem_release() clears slot->gmem.file. 
+ */ + return get_file_active(&slot->gmem.file); +} + +DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T), + kvm_gmem_get_file(slot), struct kvm_memory_slot *slot); + +static bool kvm_gmem_supports_mmap(struct inode *inode) +{ + return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP; +} + +static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct folio *folio; + vm_fault_t ret = VM_FAULT_LOCKED; + + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) + return VM_FAULT_SIGBUS; + + if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)) + return VM_FAULT_SIGBUS; + + folio = kvm_gmem_get_folio(inode, vmf->pgoff); + if (IS_ERR(folio)) { + if (PTR_ERR(folio) == -EAGAIN) + return VM_FAULT_RETRY; + + return vmf_error(PTR_ERR(folio)); + } + + if (WARN_ON_ONCE(folio_test_large(folio))) { + ret = VM_FAULT_SIGBUS; + goto out_folio; + } + + if (!folio_test_uptodate(folio)) { + clear_highpage(folio_page(folio, 0)); + kvm_gmem_mark_prepared(folio); + } + + vmf->page = folio_file_page(folio, vmf->pgoff); + +out_folio: + if (ret != VM_FAULT_LOCKED) { + folio_unlock(folio); + folio_put(folio); + } + + return ret; +} + +#ifdef CONFIG_NUMA +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) +{ + struct inode *inode = file_inode(vma->vm_file); + + return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol); +} + +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *pgoff) +{ + struct inode *inode = file_inode(vma->vm_file); + + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); + + /* + * Return the memory policy for this index, or NULL if none is set. + * + * Returning NULL, e.g. instead of the current task's memory policy, is + * important for the .get_policy kernel ABI: it indicates that no + * explicit policy has been set via mbind() for this memory. 
The caller + * can then replace NULL with the default memory policy instead of the + * current task's memory policy. + */ + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff); +} +#endif /* CONFIG_NUMA */ + +static const struct vm_operations_struct kvm_gmem_vm_ops = { + .fault = kvm_gmem_fault_user_mapping, +#ifdef CONFIG_NUMA + .get_policy = kvm_gmem_get_policy, + .set_policy = kvm_gmem_set_policy, +#endif +}; + +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!kvm_gmem_supports_mmap(file_inode(file))) + return -ENODEV; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != + (VM_SHARED | VM_MAYSHARE)) { + return -EINVAL; + } + + vma->vm_ops = &kvm_gmem_vm_ops; + + return 0; +} + +static struct file_operations kvm_gmem_fops = { + .mmap = kvm_gmem_mmap, + .open = generic_file_open, + .release = kvm_gmem_release, + .fallocate = kvm_gmem_fallocate, +}; + +static int kvm_gmem_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} + +static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio) +{ + pgoff_t start, end; + + filemap_invalidate_lock_shared(mapping); + + start = folio->index; + end = start + folio_nr_pages(folio); + + kvm_gmem_invalidate_begin(mapping->host, start, end); + + /* + * Do not truncate the range, what action is taken in response to the + * error is userspace's decision (assuming the architecture supports + * gracefully handling memory errors). If/when the guest attempts to + * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON, + * at which point KVM can either terminate the VM or propagate the + * error to userspace. 
+ */ + + kvm_gmem_invalidate_end(mapping->host, start, end); + + filemap_invalidate_unlock_shared(mapping); + + return MF_DELAYED; +} + +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE +static void kvm_gmem_free_folio(struct folio *folio) +{ + struct page *page = folio_page(folio, 0); + kvm_pfn_t pfn = page_to_pfn(page); + int order = folio_order(folio); + + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); +} +#endif + +static const struct address_space_operations kvm_gmem_aops = { + .dirty_folio = noop_dirty_folio, + .migrate_folio = kvm_gmem_migrate_folio, + .error_remove_folio = kvm_gmem_error_folio, +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + .free_folio = kvm_gmem_free_folio, +#endif +}; + +static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EINVAL; +} +static const struct inode_operations kvm_gmem_iops = { + .setattr = kvm_gmem_setattr, +}; + +bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) +{ + return true; +} + +static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) +{ + static const char *name = "[kvm-gmem]"; + struct gmem_file *f; + struct inode *inode; + struct file *file; + int fd, err; + + fd = get_unused_fd_flags(0); + if (fd < 0) + return fd; + + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (!f) { + err = -ENOMEM; + goto err_fd; + } + + /* __fput() will take care of fops_put(). */ + if (!fops_get(&kvm_gmem_fops)) { + err = -ENOENT; + goto err_gmem; + } + + inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto err_fops; + } + + inode->i_op = &kvm_gmem_iops; + inode->i_mapping->a_ops = &kvm_gmem_aops; + inode->i_mode |= S_IFREG; + inode->i_size = size; + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_inaccessible(inode->i_mapping); + /* Unmovable mappings are supposed to be marked unevictable as well. 
*/ + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + + GMEM_I(inode)->flags = flags; + + file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_inode; + } + + file->f_flags |= O_LARGEFILE; + file->private_data = f; + + kvm_get_kvm(kvm); + f->kvm = kvm; + xa_init(&f->bindings); + list_add(&f->entry, &inode->i_mapping->i_private_list); + + fd_install(fd, file); + return fd; + +err_inode: + iput(inode); +err_fops: + fops_put(&kvm_gmem_fops); +err_gmem: + kfree(f); +err_fd: + put_unused_fd(fd); + return err; +} + +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) +{ + loff_t size = args->size; + u64 flags = args->flags; + + if (flags & ~kvm_gmem_get_supported_flags(kvm)) + return -EINVAL; + + if (size <= 0 || !PAGE_ALIGNED(size)) + return -EINVAL; + + return __kvm_gmem_create(kvm, size, flags); +} + +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned int fd, loff_t offset) +{ + loff_t size = slot->npages << PAGE_SHIFT; + unsigned long start, end; + struct gmem_file *f; + struct inode *inode; + struct file *file; + int r = -EINVAL; + + BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff)); + + file = fget(fd); + if (!file) + return -EBADF; + + if (file->f_op != &kvm_gmem_fops) + goto err; + + f = file->private_data; + if (f->kvm != kvm) + goto err; + + inode = file_inode(file); + + if (offset < 0 || !PAGE_ALIGNED(offset) || + offset + size > i_size_read(inode)) + goto err; + + filemap_invalidate_lock(inode->i_mapping); + + start = offset >> PAGE_SHIFT; + end = start + slot->npages; + + if (!xa_empty(&f->bindings) && + xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { + filemap_invalidate_unlock(inode->i_mapping); + goto err; + } + + /* + * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so + * kvm_gmem_bind() must occur on a new memslot. 
Because the memslot + * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file. + */ + WRITE_ONCE(slot->gmem.file, file); + slot->gmem.pgoff = start; + if (kvm_gmem_supports_mmap(inode)) + slot->flags |= KVM_MEMSLOT_GMEM_ONLY; + + xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL); + filemap_invalidate_unlock(inode->i_mapping); + + /* + * Drop the reference to the file, even on success. The file pins KVM, + * not the other way 'round. Active bindings are invalidated if the + * file is closed before memslots are destroyed. + */ + r = 0; +err: + fput(file); + return r; +} + +static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f) +{ + unsigned long start = slot->gmem.pgoff; + unsigned long end = start + slot->npages; + + xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL); + + /* + * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn() + * cannot see this memslot. + */ + WRITE_ONCE(slot->gmem.file, NULL); +} + +void kvm_gmem_unbind(struct kvm_memory_slot *slot) +{ + /* + * Nothing to do if the underlying file was _already_ closed, as + * kvm_gmem_release() invalidates and nullifies all bindings. + */ + if (!slot->gmem.file) + return; + + CLASS(gmem_get_file, file)(slot); + + /* + * However, if the file is _being_ closed, then the bindings need to be + * removed as kvm_gmem_release() might not run until after the memslot + * is freed. Note, modifying the bindings is safe even though the file + * is dying as kvm_gmem_release() nullifies slot->gmem.file under + * slots_lock, and only puts its reference to KVM after destroying all + * bindings. I.e. reaching this point means kvm_gmem_release() hasn't + * yet destroyed the bindings or freed the gmem_file, and can't do so + * until the caller drops slots_lock. 
+ */ + if (!file) { + __kvm_gmem_unbind(slot, slot->gmem.file->private_data); + return; + } + + filemap_invalidate_lock(file->f_mapping); + __kvm_gmem_unbind(slot, file->private_data); + filemap_invalidate_unlock(file->f_mapping); +} + +/* Returns a locked folio on success. */ +static struct folio *__kvm_gmem_get_pfn(struct file *file, + struct kvm_memory_slot *slot, + pgoff_t index, kvm_pfn_t *pfn, + bool *is_prepared, int *max_order) +{ + struct file *slot_file = READ_ONCE(slot->gmem.file); + struct gmem_file *f = file->private_data; + struct folio *folio; + + if (file != slot_file) { + WARN_ON_ONCE(slot_file); + return ERR_PTR(-EFAULT); + } + + if (xa_load(&f->bindings, index) != slot) { + WARN_ON_ONCE(xa_load(&f->bindings, index)); + return ERR_PTR(-EIO); + } + + folio = kvm_gmem_get_folio(file_inode(file), index); + if (IS_ERR(folio)) + return folio; + + if (folio_test_hwpoison(folio)) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(-EHWPOISON); + } + + *pfn = folio_file_pfn(folio, index); + if (max_order) + *max_order = 0; + + *is_prepared = folio_test_uptodate(folio); + return folio; +} + +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, struct page **page, + int *max_order) +{ + pgoff_t index = kvm_gmem_get_index(slot, gfn); + struct folio *folio; + bool is_prepared = false; + int r = 0; + + CLASS(gmem_get_file, file)(slot); + if (!file) + return -EFAULT; + + folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + if (!is_prepared) + r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); + + folio_unlock(folio); + + if (!r) + *page = folio_file_page(folio, index); + else + folio_put(folio); + + return r; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); + +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE +long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, + kvm_gmem_populate_cb post_populate, void 
*opaque) +{ + struct kvm_memory_slot *slot; + void __user *p; + + int ret = 0, max_order; + long i; + + lockdep_assert_held(&kvm->slots_lock); + + if (WARN_ON_ONCE(npages <= 0)) + return -EINVAL; + + slot = gfn_to_memslot(kvm, start_gfn); + if (!kvm_slot_has_gmem(slot)) + return -EINVAL; + + CLASS(gmem_get_file, file)(slot); + if (!file) + return -EFAULT; + + filemap_invalidate_lock(file->f_mapping); + + npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); + for (i = 0; i < npages; i += (1 << max_order)) { + struct folio *folio; + gfn_t gfn = start_gfn + i; + pgoff_t index = kvm_gmem_get_index(slot, gfn); + bool is_prepared = false; + kvm_pfn_t pfn; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + break; + } + + if (is_prepared) { + folio_unlock(folio); + folio_put(folio); + ret = -EEXIST; + break; + } + + folio_unlock(folio); + WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) || + (npages - i) < (1 << max_order)); + + ret = -EINVAL; + while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), + KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_ATTRIBUTE_PRIVATE)) { + if (!max_order) + goto put_folio_and_exit; + max_order--; + } + + p = src ? src + i * PAGE_SIZE : NULL; + ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); + if (!ret) + kvm_gmem_mark_prepared(folio); + +put_folio_and_exit: + folio_put(folio); + if (ret) + break; + } + + filemap_invalidate_unlock(file->f_mapping); + + return ret && !i ? ret : i; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); +#endif + +static struct kmem_cache *kvm_gmem_inode_cachep; + +static void kvm_gmem_init_inode_once(void *__gi) +{ + struct gmem_inode *gi = __gi; + + /* + * Note! 
Don't initialize the inode with anything specific to the + * guest_memfd instance, or that might be specific to how the inode is + * used (from the VFS-layer's perspective). This hook is called only + * during the initial slab allocation, i.e. only fields/state that are + * idempotent across _all_ use of the inode _object_ can be initialized + * at this time! + */ + inode_init_once(&gi->vfs_inode); +} + +static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) +{ + struct gmem_inode *gi; + + gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL); + if (!gi) + return NULL; + + mpol_shared_policy_init(&gi->policy, NULL); + + gi->flags = 0; + return &gi->vfs_inode; +} + +static void kvm_gmem_destroy_inode(struct inode *inode) +{ + mpol_free_shared_policy(&GMEM_I(inode)->policy); +} + +static void kvm_gmem_free_inode(struct inode *inode) +{ + kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode)); +} + +static const struct super_operations kvm_gmem_super_operations = { + .statfs = simple_statfs, + .alloc_inode = kvm_gmem_alloc_inode, + .destroy_inode = kvm_gmem_destroy_inode, + .free_inode = kvm_gmem_free_inode, +}; + +static int kvm_gmem_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx; + + if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) + return -ENOMEM; + + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; + ctx = fc->fs_private; + ctx->ops = &kvm_gmem_super_operations; + + return 0; +} + +static struct file_system_type kvm_gmem_fs = { + .name = "guest_memfd", + .init_fs_context = kvm_gmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int kvm_gmem_init_mount(void) +{ + kvm_gmem_mnt = kern_mount(&kvm_gmem_fs); + + if (IS_ERR(kvm_gmem_mnt)) + return PTR_ERR(kvm_gmem_mnt); + + kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; + return 0; +} + +int kvm_gmem_init(struct module *module) +{ + struct kmem_cache_args args = { + .align = 0, + .ctor = kvm_gmem_init_inode_once, + }; + int ret; + + kvm_gmem_fops.owner = module; + 
kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache", + sizeof(struct gmem_inode), + &args, SLAB_ACCOUNT); + if (!kvm_gmem_inode_cachep) + return -ENOMEM; + + ret = kvm_gmem_init_mount(); + if (ret) { + kmem_cache_destroy(kvm_gmem_inode_cachep); + return ret; + } + return 0; +} + +void kvm_gmem_exit(void) +{ + kern_unmount(kvm_gmem_mnt); + kvm_gmem_mnt = NULL; + rcu_barrier(); + kmem_cache_destroy(kvm_gmem_inode_cachep); +} diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index b1286c4e0712..6ccabfd32287 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * irqchip.c: Common API for in kernel interrupt controllers * Copyright (c) 2007, Intel Corporation. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright (c) 2013, Alexander Graf <agraf@suse.de> * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * * This file is derived from virt/kvm/irq_comm.c. 
* * Authors: @@ -29,7 +17,6 @@ #include <linux/srcu.h> #include <linux/export.h> #include <trace/events/kvm.h> -#include "irq.h" int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) @@ -62,7 +49,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) { struct kvm_kernel_irq_routing_entry route; - if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) + if (!kvm_arch_irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) return -EINVAL; route.msi.address_lo = msi->address_lo; @@ -144,18 +131,19 @@ static int setup_routing_entry(struct kvm *kvm, { struct kvm_kernel_irq_routing_entry *ei; int r; + u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES); /* * Do not allow GSI to be mapped to the same irqchip more than once. * Allow only one to one mapping between GSI and non-irqchip routing. */ - hlist_for_each_entry(ei, &rt->map[ue->gsi], link) + hlist_for_each_entry(ei, &rt->map[gsi], link) if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return -EINVAL; - e->gsi = ue->gsi; + e->gsi = gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) @@ -195,9 +183,7 @@ int kvm_set_irq_routing(struct kvm *kvm, nr_rt_entries += 1; - new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)), - GFP_KERNEL); - + new = kzalloc(struct_size(new, map, nr_rt_entries), GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; @@ -208,7 +194,7 @@ int kvm_set_irq_routing(struct kvm *kvm, for (i = 0; i < nr; ++i) { r = -ENOMEM; - e = kzalloc(sizeof(*e), GFP_KERNEL); + e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT); if (!e) goto out; @@ -236,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_arch_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); - kvm_arch_post_irq_routing_update(kvm); - synchronize_srcu_expedited(&kvm->irq_srcu); new = old; @@ -251,3 +235,27 @@ out: return r; } + +/* + * Allocate empty IRQ 
routing by default so that additional setup isn't needed + * when userspace-driven IRQ routing is activated, and so that kvm->irq_routing + * is guaranteed to be non-NULL. + */ +int kvm_init_irq_routing(struct kvm *kvm) +{ + struct kvm_irq_routing_table *new; + int chip_size; + + new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->nr_rt_entries = 1; + + chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS; + memset(new->chip, -1, chip_size); + + RCU_INIT_POINTER(kvm->irq_routing, new); + + return 0; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5ecea812cb6a..5fcd401a5897 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1,8 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. + * Kernel-based Virtual Machine (KVM) Hypervisor * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. @@ -10,10 +8,6 @@ * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include <kvm/iodev.h> @@ -51,40 +45,62 @@ #include <linux/slab.h> #include <linux/sort.h> #include <linux/bsearch.h> +#include <linux/io.h> +#include <linux/lockdep.h> +#include <linux/kthread.h> +#include <linux/suspend.h> +#include <linux/rseq.h> #include <asm/processor.h> -#include <asm/io.h> #include <asm/ioctl.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include "coalesced_mmio.h" #include "async_pf.h" +#include "kvm_mm.h" #include "vfio.h" +#include <trace/events/ipi.h> + #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> +#include <linux/kvm_dirty_ring.h> + + /* Worst case buffer size needed for holding an integer. 
*/ #define ITOA_MAX_LEN 12 MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor"); MODULE_LICENSE("GPL"); /* Architectures should define their poll value according to the halt latency */ unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; module_param(halt_poll_ns, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns); /* Default doubles per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_grow = 2; module_param(halt_poll_ns_grow, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_grow); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow); + +/* The start value to grow halt_poll_ns from */ +unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ +module_param(halt_poll_ns_grow_start, uint, 0644); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow_start); -/* Default resets per-vcpu halt_poll_ns . */ -unsigned int halt_poll_ns_shrink; +/* Default halves per-vcpu halt_poll_ns. */ +unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink); + +/* + * Allow direct access (from KVM or the CPU) without MMU notifier protection + * to unpinned pages. 
+ */ +static bool allow_unsafe_mappings; +module_param(allow_unsafe_mappings, bool, 0444); /* * Ordering of locks: @@ -92,24 +108,17 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_SPINLOCK(kvm_lock); -static DEFINE_RAW_SPINLOCK(kvm_count_lock); +DEFINE_MUTEX(kvm_lock); LIST_HEAD(vm_list); -static cpumask_var_t cpus_hardware_enabled; -static int kvm_usage_count; -static atomic_t hardware_enable_failed; - -struct kmem_cache *kvm_vcpu_cache; -EXPORT_SYMBOL_GPL(kvm_vcpu_cache); +static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; +static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); -struct dentry *kvm_debugfs_dir; -EXPORT_SYMBOL_GPL(kvm_debugfs_dir); +static struct dentry *kvm_debugfs_dir; -static int kvm_debugfs_num_entries; -static const struct file_operations *stat_fops_per_vm[]; +static const struct file_operations stat_fops_per_vm; static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); @@ -118,40 +127,36 @@ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #define KVM_COMPAT(c) .compat_ioctl = (c) #else +/* + * For architectures that don't implement a compat infrastructure, + * adopt a double line of defense: + * - Prevent a compat task from opening /dev/kvm + * - If the open has been done by a 64bit task, and the KVM fd + * passed to a compat task, let the ioctls fail. + */ static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { return -EINVAL; } -#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl + +static int kvm_no_compat_open(struct inode *inode, struct file *file) +{ + return is_compat_task() ? 
-ENODEV : 0; +} +#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ + .open = kvm_no_compat_open #endif -static int hardware_enable_all(void); -static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); - -__visible bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); - -static bool largepages_enabled = true; - #define KVM_EVENT_CREATE_VM 0 #define KVM_EVENT_DESTROY_VM 1 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; -__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end, bool blockable) -{ - return 0; -} +static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask); -bool kvm_is_reserved_pfn(kvm_pfn_t pfn) +__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { - if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)); - - return true; } /* @@ -160,20 +165,23 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) void vcpu_load(struct kvm_vcpu *vcpu) { int cpu = get_cpu(); + + __this_cpu_write(kvm_running_vcpu, vcpu); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); } -EXPORT_SYMBOL_GPL(vcpu_load); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { preempt_disable(); kvm_arch_vcpu_put(vcpu); preempt_notifier_unregister(&vcpu->preempt_notifier); + __this_cpu_write(kvm_running_vcpu, NULL); preempt_enable(); } -EXPORT_SYMBOL_GPL(vcpu_put); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_put); /* TODO: merge with kvm_arch_vcpu_should_kick */ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) @@ -193,47 +201,68 @@ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) return mode == IN_GUEST_MODE; } -static void ack_flush(void *_completed) +static void ack_kick(void *_completed) { } 
-static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) +static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait) { - if (unlikely(!cpus)) - cpus = cpu_online_mask; - if (cpumask_empty(cpus)) return false; - smp_call_function_many(cpus, ack_flush, NULL, wait); + smp_call_function_many(cpus, ack_kick, NULL, wait); return true; } +static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req, + struct cpumask *tmp, int current_cpu) +{ + int cpu; + + if (likely(!(req & KVM_REQUEST_NO_ACTION))) + __kvm_make_request(req, vcpu); + + if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) + return; + + /* + * Note, the vCPU could get migrated to a different pCPU at any point + * after kvm_request_needs_ipi(), which could result in sending an IPI + * to the previous pCPU. But, that's OK because the purpose of the IPI + * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is + * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES + * after this point is also OK, as the requirement is only that KVM wait + * for vCPUs that were reading SPTEs _before_ any changes were + * finalized. See kvm_vcpu_kick() for more details on handling requests. 
+ */ + if (kvm_request_needs_ipi(vcpu, req)) { + cpu = READ_ONCE(vcpu->cpu); + if (cpu != -1 && cpu != current_cpu) + __cpumask_set_cpu(cpu, tmp); + } +} + bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, - unsigned long *vcpu_bitmap, cpumask_var_t tmp) + unsigned long *vcpu_bitmap) { - int i, cpu, me; struct kvm_vcpu *vcpu; + struct cpumask *cpus; + int i, me; bool called; me = get_cpu(); - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) - continue; + cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); + cpumask_clear(cpus); - kvm_make_request(req, vcpu); - cpu = vcpu->cpu; - - if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) + for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) { + vcpu = kvm_get_vcpu(kvm, i); + if (!vcpu) continue; - - if (tmp != NULL && cpu != -1 && cpu != me && - kvm_request_needs_ipi(vcpu, req)) - __cpumask_set_cpu(cpu, tmp); + kvm_make_vcpu_request(vcpu, req, cpus, me); } - called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); + called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); put_cpu(); return called; @@ -241,25 +270,30 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { - cpumask_var_t cpus; + struct kvm_vcpu *vcpu; + struct cpumask *cpus; + unsigned long i; bool called; + int me; + + me = get_cpu(); - zalloc_cpumask_var(&cpus, GFP_ATOMIC); + cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); + cpumask_clear(cpus); - called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus); + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_make_vcpu_request(vcpu, req, cpus, me); + + called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); + put_cpu(); - free_cpumask_var(cpus); return called; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_make_all_cpus_request); -#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) { - /* - * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH 
in - * kvm_make_all_cpus_request. - */ - long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); + ++kvm->stat.generic.remote_tlb_flush_requests; /* * We want to publish modifications to the page tables before reading @@ -272,170 +306,540 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that * barrier here. */ - if (!kvm_arch_flush_remote_tlb(kvm) + if (!kvm_arch_flush_remote_tlbs(kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) - ++kvm->stat.remote_tlb_flush; - cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); + ++kvm->stat.generic.remote_tlb_flush; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_flush_remote_tlbs); + +void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) +{ + if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages)) + return; + + /* + * Fall back to a flushing entire TLBs if the architecture range-based + * TLB invalidation is unsupported or can't be performed for whatever + * reason. + */ + kvm_flush_remote_tlbs(kvm); +} + +void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, + const struct kvm_memory_slot *memslot) +{ + /* + * All current use cases for flushing the TLBs for a specific memslot + * are related to dirty logging, and many do the TLB flush out of + * mmu_lock. The interaction between the various operations on memslot + * must be serialized by slots_lock to ensure the TLB flush from one + * operation is observed by any other operation on the same memslot. 
+ */ + lockdep_assert_held(&kvm->slots_lock); + kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages); } -EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); -#endif -void kvm_reload_remote_mmus(struct kvm *kvm) +static void kvm_flush_shadow_all(struct kvm *kvm) { - kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); + kvm_arch_flush_shadow_all(kvm); + kvm_arch_guest_memory_reclaimed(kvm); } -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE +static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, + gfp_t gfp_flags) { - struct page *page; - int r; + void *page; + + gfp_flags |= mc->gfp_zero; + + if (mc->kmem_cache) + return kmem_cache_alloc(mc->kmem_cache, gfp_flags); + + page = (void *)__get_free_page(gfp_flags); + if (page && mc->init_value) + memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64)); + return page; +} + +int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min) +{ + gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT; + void *obj; + + if (mc->nobjs >= min) + return 0; + + if (unlikely(!mc->objects)) { + if (WARN_ON_ONCE(!capacity)) + return -EIO; + + /* + * Custom init values can be used only for page allocations, + * and obviously conflict with __GFP_ZERO. + */ + if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero))) + return -EIO; + + mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp); + if (!mc->objects) + return -ENOMEM; + mc->capacity = capacity; + } + + /* It is illegal to request a different capacity across topups. */ + if (WARN_ON_ONCE(mc->capacity != capacity)) + return -EIO; + + while (mc->nobjs < mc->capacity) { + obj = mmu_memory_cache_alloc_obj(mc, gfp); + if (!obj) + return mc->nobjs >= min ? 
0 : -ENOMEM; + mc->objects[mc->nobjs++] = obj; + } + return 0; +} + +int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) +{ + return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min); +} + +int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) +{ + return mc->nobjs; +} + +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) { + if (mc->kmem_cache) + kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); + else + free_page((unsigned long)mc->objects[--mc->nobjs]); + } + + kvfree(mc->objects); + + mc->objects = NULL; + mc->capacity = 0; +} + +void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) +{ + void *p; + + if (WARN_ON(!mc->nobjs)) + p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); + else + p = mc->objects[--mc->nobjs]; + BUG_ON(!p); + return p; +} +#endif + +static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ mutex_init(&vcpu->mutex); vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; - init_swait_queue_head(&vcpu->wq); + rwlock_init(&vcpu->pid_lock); +#ifndef __KVM_HAVE_ARCH_WQP + rcuwait_init(&vcpu->wait); +#endif kvm_async_pf_vcpu_init(vcpu); - vcpu->pre_pcpu = -1; - INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->run = page_address(page); - kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; + vcpu->ready = false; + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + vcpu->last_used_slot = NULL; - r = kvm_arch_vcpu_init(vcpu); - if (r < 0) - goto fail_free_run; - return 0; - -fail_free_run: - free_page((unsigned long)vcpu->run); -fail: - return r; + /* Fill the stats id string for the vcpu */ + snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", + task_pid_nr(current), id); } 
-EXPORT_SYMBOL_GPL(kvm_vcpu_init); -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { + kvm_arch_vcpu_destroy(vcpu); + kvm_dirty_ring_free(&vcpu->dirty_ring); + /* - * no need for rcu_read_lock as VCPU_RUN is the only place that - * will change the vcpu->pid pointer and on uninit all file - * descriptors are already gone. + * No need for rcu_read_lock as VCPU_RUN is the only place that changes + * the vcpu->pid pointer, and at destruction time all file descriptors + * are already gone. */ - put_pid(rcu_dereference_protected(vcpu->pid, 1)); - kvm_arch_vcpu_uninit(vcpu); + put_pid(vcpu->pid); + free_page((unsigned long)vcpu->run); + kmem_cache_free(kvm_vcpu_cache, vcpu); +} + +void kvm_destroy_vcpus(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_vcpu_destroy(vcpu); + xa_erase(&kvm->vcpu_array, i); + + /* + * Assert that the vCPU isn't visible in any way, to ensure KVM + * doesn't trigger a use-after-free if destroying vCPUs results + * in VM-wide request, e.g. to flush remote TLBs when tearing + * down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires. 
+ */ + WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i)); + } + + atomic_set(&kvm->online_vcpus, 0); } -EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_destroy_vcpus); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); } -static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, - pte_t pte) +typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); + +typedef void (*on_lock_fn_t)(struct kvm *kvm); + +struct kvm_mmu_notifier_range { + /* + * 64-bit addresses, as KVM notifiers can operate on host virtual + * addresses (unsigned long) and guest physical addresses (64-bit). + */ + u64 start; + u64 end; + union kvm_mmu_notifier_arg arg; + gfn_handler_t handler; + on_lock_fn_t on_lock; + bool flush_on_ret; + bool may_block; + bool lockless; +}; + +/* + * The inner-most helper returns a tuple containing the return value from the + * arch- and action-specific handler, plus a flag indicating whether or not at + * least one memslot was found, i.e. if the handler found guest memory. + * + * Note, most notifiers are averse to booleans, so even though KVM tracks the + * return from arch code as a bool, outer helpers will cast it to an int. :-( + */ +typedef struct kvm_mmu_notifier_return { + bool ret; + bool found_memslot; +} kvm_mn_ret_t; + +/* + * Use a dedicated stub instead of NULL to indicate that there is no callback + * function/handler. The compiler technically can't guarantee that a real + * function will have a non-zero address, and so it will generate code to + * check for !NULL, whereas comparing against a stub will be elided at compile + * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9). 
+ */ +static void kvm_null_fn(void) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int idx; + +} +#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) + +/* Iterate over each memslot intersecting [start, last] (inclusive) range */ +#define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \ + for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \ + node; \ + node = interval_tree_iter_next(node, start, last)) \ + +static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_mmu_notifier_range *range) +{ + struct kvm_mmu_notifier_return r = { + .ret = false, + .found_memslot = false, + }; + struct kvm_gfn_range gfn_range; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + int i, idx; + + if (WARN_ON_ONCE(range->end <= range->start)) + return r; + + /* A null handler is allowed if and only if on_lock() is provided. */ + if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && + IS_KVM_NULL_FN(range->handler))) + return r; + + /* on_lock will never be called for lockless walks */ + if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock))) + return r; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); - kvm->mmu_notifier_seq++; - if (kvm_set_spte_hva(kvm, address, pte)) + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { + struct interval_tree_node *node; + + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot_in_hva_range(node, slots, + range->start, range->end - 1) { + unsigned long hva_start, hva_end; + + slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]); + hva_start = max_t(unsigned long, range->start, slot->userspace_addr); + hva_end = min_t(unsigned long, range->end, + slot->userspace_addr + (slot->npages << PAGE_SHIFT)); + + /* + * To optimize for the likely case where the address + * range is covered by zero or one memslots, don't + * bother making these conditional (to avoid writes on + * the second or later invocation of the 
handler). + */ + gfn_range.arg = range->arg; + gfn_range.may_block = range->may_block; + /* + * HVA-based notifications aren't relevant to private + * mappings as they don't have a userspace mapping. + */ + gfn_range.attr_filter = KVM_FILTER_SHARED; + + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_range.start = hva_to_gfn_memslot(hva_start, slot); + gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); + gfn_range.slot = slot; + gfn_range.lockless = range->lockless; + + if (!r.found_memslot) { + r.found_memslot = true; + if (!range->lockless) { + KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm); + + if (IS_KVM_NULL_FN(range->handler)) + goto mmu_unlock; + } + } + r.ret |= range->handler(kvm, &gfn_range); + } + } + + if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); +mmu_unlock: + if (r.found_memslot && !range->lockless) + KVM_MMU_UNLOCK(kvm); + srcu_read_unlock(&kvm->srcu, idx); + + return r; } -static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) +static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + gfn_handler_t handler, + bool flush_on_ret) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush = 0, idx; - int ret; + const struct kvm_mmu_notifier_range range = { + .start = start, + .end = end, + .handler = handler, + .on_lock = (void *)kvm_null_fn, + .flush_on_ret = flush_on_ret, + .may_block = false, + .lockless = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING), + }; - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + return kvm_handle_hva_range(kvm, &range).ret; +} + +static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + gfn_handler_t handler) +{ + return kvm_age_hva_range(mn, start, 
end, handler, false); +} + +void kvm_mmu_invalidate_begin(struct kvm *kvm) +{ + lockdep_assert_held_write(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and * count is also read inside the mmu_lock critical section. */ - kvm->mmu_notifier_count++; - need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end); - need_tlb_flush |= kvm->tlbs_dirty; - /* we've to flush the tlb before the pages can be freed */ - if (need_tlb_flush) - kvm_flush_remote_tlbs(kvm); + kvm->mmu_invalidate_in_progress++; - spin_unlock(&kvm->mmu_lock); + if (likely(kvm->mmu_invalidate_in_progress == 1)) { + kvm->mmu_invalidate_range_start = INVALID_GPA; + kvm->mmu_invalidate_range_end = INVALID_GPA; + } +} - ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, - range->end, range->blockable); +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end) +{ + lockdep_assert_held_write(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); + WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress); - return ret; + if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) { + kvm->mmu_invalidate_range_start = start; + kvm->mmu_invalidate_range_end = end; + } else { + /* + * Fully tracking multiple concurrent ranges has diminishing + * returns. Keep things simple and just find the minimal range + * which includes the current and new ranges. As there won't be + * enough information to subtract a range after its invalidate + * completes, any ranges invalidated concurrently will + * accumulate and persist until all outstanding invalidates + * complete. 
+ */ + kvm->mmu_invalidate_range_start = + min(kvm->mmu_invalidate_range_start, start); + kvm->mmu_invalidate_range_end = + max(kvm->mmu_invalidate_range_end, end); + } } -static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + kvm_mmu_invalidate_range_add(kvm, range->start, range->end); + return kvm_unmap_gfn_range(kvm, range); +} + +static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); + const struct kvm_mmu_notifier_range hva_range = { + .start = range->start, + .end = range->end, + .handler = kvm_mmu_unmap_gfn_range, + .on_lock = kvm_mmu_invalidate_begin, + .flush_on_ret = true, + .may_block = mmu_notifier_range_blockable(range), + }; + + trace_kvm_unmap_hva_range(range->start, range->end); + + /* + * Prevent memslot modification between range_start() and range_end() + * so that conditionally locking provides the same result in both + * functions. Without that guarantee, the mmu_invalidate_in_progress + * adjustments will be imbalanced. + * + * Pairs with the decrement in range_end(). + */ + spin_lock(&kvm->mn_invalidate_lock); + kvm->mn_active_invalidate_count++; + spin_unlock(&kvm->mn_invalidate_lock); + + /* + * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e. + * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring + * each cache's lock. There are relatively few caches in existence at + * any given time, and the caches themselves can check for hva overlap, + * i.e. don't need to rely on memslot overlap checks for performance. + * Because this runs without holding mmu_lock, the pfn caches must use + * mn_active_invalidate_count (see above) instead of + * mmu_invalidate_in_progress. 
+ */ + gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end); + + /* + * If one or more memslots were found and thus zapped, notify arch code + * that guest memory has been reclaimed. This needs to be done *after* + * dropping mmu_lock, as x86's reclaim path is slooooow. + */ + if (kvm_handle_hva_range(kvm, &hva_range).found_memslot) + kvm_arch_guest_memory_reclaimed(kvm); + + return 0; +} + +void kvm_mmu_invalidate_end(struct kvm *kvm) +{ + lockdep_assert_held_write(&kvm->mmu_lock); - spin_lock(&kvm->mmu_lock); /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have * been freed. */ - kvm->mmu_notifier_seq++; + kvm->mmu_invalidate_seq++; smp_wmb(); /* * The above sequence increase must be visible before the * below count decrease, which is ensured by the smp_wmb above - * in conjunction with the smp_rmb in mmu_notifier_retry(). + * in conjunction with the smp_rmb in mmu_invalidate_retry(). */ - kvm->mmu_notifier_count--; - spin_unlock(&kvm->mmu_lock); + kvm->mmu_invalidate_in_progress--; + KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm); - BUG_ON(kvm->mmu_notifier_count < 0); + /* + * Assert that at least one range was added between start() and end(). + * Not adding a range isn't fatal, but it is a KVM bug. 
+ */ + WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA); } -static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) +static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young, idx; + const struct kvm_mmu_notifier_range hva_range = { + .start = range->start, + .end = range->end, + .handler = (void *)kvm_null_fn, + .on_lock = kvm_mmu_invalidate_end, + .flush_on_ret = false, + .may_block = mmu_notifier_range_blockable(range), + }; + bool wake; - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + kvm_handle_hva_range(kvm, &hva_range); - young = kvm_age_hva(kvm, start, end); - if (young) - kvm_flush_remote_tlbs(kvm); + /* Pairs with the increment in range_start(). */ + spin_lock(&kvm->mn_invalidate_lock); + if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count)) + --kvm->mn_active_invalidate_count; + wake = !kvm->mn_active_invalidate_count; + spin_unlock(&kvm->mn_invalidate_lock); - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); + /* + * There can only be one waiter, since the wait happens under + * slots_lock. 
+ */ + if (wake) + rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); +} + +static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + trace_kvm_age_hva(start, end); - return young; + return kvm_age_hva_range(mn, start, end, kvm_age_gfn, + !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -443,11 +847,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young, idx; + trace_kvm_age_hva(start, end); - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); /* * Even though we do not flush TLB, this will still adversely * affect performance on pre-Haswell Intel EPT, where there is @@ -461,27 +862,17 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. 
*/ - young = kvm_age_hva(kvm, start, end); - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - - return young; + return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn); } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young, idx; - - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); - young = kvm_test_age_hva(kvm, address); - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); + trace_kvm_test_age_hva(address); - return young; + return kvm_age_hva_range_no_flush(mn, address, address + 1, + kvm_test_age_gfn); } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, @@ -491,7 +882,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, int idx; idx = srcu_read_lock(&kvm->srcu); - kvm_arch_flush_shadow_all(kvm); + kvm_flush_shadow_all(kvm); srcu_read_unlock(&kvm->srcu, idx); } @@ -501,7 +892,6 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .clear_flush_young = kvm_mmu_notifier_clear_flush_young, .clear_young = kvm_mmu_notifier_clear_young, .test_young = kvm_mmu_notifier_test_young, - .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; @@ -511,71 +901,108 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } -#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ +#else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */ static int kvm_init_mmu_notifier(struct kvm *kvm) { return 0; } -#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ +#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */ -static struct kvm_memslots *kvm_alloc_memslots(void) +#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER +static int kvm_pm_notifier_call(struct notifier_block *bl, + unsigned long state, + void *unused) { - int i; - struct kvm_memslots *slots; + struct kvm *kvm = container_of(bl, struct kvm, pm_notifier); 
- slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) - return NULL; + return kvm_arch_pm_notifier(kvm, state); +} - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) - slots->id_to_index[i] = slots->memslots[i].id = i; +static void kvm_init_pm_notifier(struct kvm *kvm) +{ + kvm->pm_notifier.notifier_call = kvm_pm_notifier_call; + /* Suspend KVM before we suspend ftrace, RCU, etc. */ + kvm->pm_notifier.priority = INT_MAX; + register_pm_notifier(&kvm->pm_notifier); +} - return slots; +static void kvm_destroy_pm_notifier(struct kvm *kvm) +{ + unregister_pm_notifier(&kvm->pm_notifier); +} +#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */ +static void kvm_init_pm_notifier(struct kvm *kvm) +{ +} + +static void kvm_destroy_pm_notifier(struct kvm *kvm) +{ } +#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) { if (!memslot->dirty_bitmap) return; - kvfree(memslot->dirty_bitmap); + vfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; } -/* - * Free any memory in @free but not in @dont. 
- */ -static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +/* This does not remove the slot from struct kvm_memslots data structures */ +static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - if (!dont || free->dirty_bitmap != dont->dirty_bitmap) - kvm_destroy_dirty_bitmap(free); + if (slot->flags & KVM_MEM_GUEST_MEMFD) + kvm_gmem_unbind(slot); + + kvm_destroy_dirty_bitmap(slot); - kvm_arch_free_memslot(kvm, free, dont); + kvm_arch_free_memslot(kvm, slot); - free->npages = 0; + kfree(slot); } static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) { + struct hlist_node *idnode; struct kvm_memory_slot *memslot; + int bkt; - if (!slots) + /* + * The same memslot objects live in both active and inactive sets, + * arbitrarily free using index '1' so the second invocation of this + * function isn't operating over a structure with dangling pointers + * (even though this function isn't actually touching them). 
+ */ + if (!slots->node_idx) return; - kvm_for_each_memslot(memslot, slots) - kvm_free_memslot(kvm, memslot, NULL); + hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1]) + kvm_free_memslot(kvm, memslot); +} - kvfree(slots); +static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc) +{ + switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) { + case KVM_STATS_TYPE_INSTANT: + return 0444; + case KVM_STATS_TYPE_CUMULATIVE: + case KVM_STATS_TYPE_PEAK: + default: + return 0644; + } } + static void kvm_destroy_vm_debugfs(struct kvm *kvm) { int i; + int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + + kvm_vcpu_stats_header.num_desc; - if (!kvm->debugfs_dentry) + if (IS_ERR(kvm->debugfs_dentry)) return; debugfs_remove_recursive(kvm->debugfs_dentry); @@ -587,119 +1014,236 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm) } } -static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) +static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname) { + static DEFINE_MUTEX(kvm_debugfs_lock); + struct dentry *dent; char dir_name[ITOA_MAX_LEN * 2]; struct kvm_stat_data *stat_data; - struct kvm_stats_debugfs_item *p; + const struct _kvm_stats_desc *pdesc; + int i, ret = -ENOMEM; + int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + + kvm_vcpu_stats_header.num_desc; if (!debugfs_initialized()) return 0; - snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); - kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir); + snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname); + mutex_lock(&kvm_debugfs_lock); + dent = debugfs_lookup(dir_name, kvm_debugfs_dir); + if (dent) { + pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name); + dput(dent); + mutex_unlock(&kvm_debugfs_lock); + return 0; + } + dent = debugfs_create_dir(dir_name, kvm_debugfs_dir); + mutex_unlock(&kvm_debugfs_lock); + if (IS_ERR(dent)) + return 0; + kvm->debugfs_dentry = dent; 
kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, sizeof(*kvm->debugfs_stat_data), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!kvm->debugfs_stat_data) - return -ENOMEM; + goto out_err; - for (p = debugfs_entries; p->name; p++) { - stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); + for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { + pdesc = &kvm_vm_stats_desc[i]; + stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) - return -ENOMEM; + goto out_err; + + stat_data->kvm = kvm; + stat_data->desc = pdesc; + stat_data->kind = KVM_STAT_VM; + kvm->debugfs_stat_data[i] = stat_data; + debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), + kvm->debugfs_dentry, stat_data, + &stat_fops_per_vm); + } + + for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { + pdesc = &kvm_vcpu_stats_desc[i]; + stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); + if (!stat_data) + goto out_err; stat_data->kvm = kvm; - stat_data->offset = p->offset; - kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, - stat_data, stat_fops_per_vm[p->kind]); + stat_data->desc = pdesc; + stat_data->kind = KVM_STAT_VCPU; + kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data; + debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), + kvm->debugfs_dentry, stat_data, + &stat_fops_per_vm); } + + kvm_arch_create_vm_debugfs(kvm); return 0; +out_err: + kvm_destroy_vm_debugfs(kvm); + return ret; } -static struct kvm *kvm_create_vm(unsigned long type) +/* + * Called just after removing the VM from the vm_list, but before doing any + * other destruction. + */ +void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ +} + +/* + * Called after per-vm debugfs created. When called kvm->debugfs_dentry should + * be setup already, so we can create arch-specific debugfs entries under it. 
+ * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so + * a per-arch destroy interface is not needed. + */ +void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ +} + +/* Called only on cleanup and destruction paths when there are no users. */ +static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm, + enum kvm_bus idx) +{ + return rcu_dereference_protected(kvm->buses[idx], + !refcount_read(&kvm->users_count)); +} + +static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { - int r, i; struct kvm *kvm = kvm_arch_alloc_vm(); + struct kvm_memslots *slots; + int r, i, j; if (!kvm) return ERR_PTR(-ENOMEM); - spin_lock_init(&kvm->mmu_lock); + KVM_MMU_LOCK_INIT(kvm); mmgrab(current->mm); kvm->mm = current->mm; kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); - refcount_set(&kvm->users_count, 1); - INIT_LIST_HEAD(&kvm->devices); + mutex_init(&kvm->slots_arch_lock); + spin_lock_init(&kvm->mn_invalidate_lock); + rcuwait_init(&kvm->mn_memslots_update_rcuwait); + xa_init(&kvm->vcpu_array); +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + xa_init(&kvm->mem_attr_array); +#endif - r = kvm_arch_init_vm(kvm, type); - if (r) - goto out_err_no_disable; + INIT_LIST_HEAD(&kvm->gpc_list); + spin_lock_init(&kvm->gpc_lock); - r = hardware_enable_all(); - if (r) - goto out_err_no_disable; - -#ifdef CONFIG_HAVE_KVM_IRQFD - INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); -#endif + INIT_LIST_HEAD(&kvm->devices); + kvm->max_vcpus = KVM_MAX_VCPUS; BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); - r = -ENOMEM; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_memslots *slots = kvm_alloc_memslots(); - if (!slots) - goto out_err_no_srcu; - /* - * Generations must be different for each address space. - * Init kvm generation close to the maximum to easily test the - * code of handling generation number wrap-around. 
- */ - slots->generation = i * 2 - 150; - rcu_assign_pointer(kvm->memslots[i], slots); - } + /* + * Force subsequent debugfs file creations to fail if the VM directory + * is not created (by kvm_create_vm_debugfs()). + */ + kvm->debugfs_dentry = ERR_PTR(-ENOENT); + + snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d", + task_pid_nr(current)); + r = -ENOMEM; if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) goto out_err_no_irq_srcu; + + r = kvm_init_irq_routing(kvm); + if (r) + goto out_err_no_irq_routing; + + refcount_set(&kvm->users_count, 1); + + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { + for (j = 0; j < 2; j++) { + slots = &kvm->__memslots[i][j]; + + atomic_long_set(&slots->last_used_slot, (unsigned long)NULL); + slots->hva_tree = RB_ROOT_CACHED; + slots->gfn_tree = RB_ROOT; + hash_init(slots->id_hash); + slots->node_idx = j; + + /* Generations must be different for each address space. */ + slots->generation = i; + } + + rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]); + } + + r = -ENOMEM; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], - kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); + kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); if (!kvm->buses[i]) - goto out_err; + goto out_err_no_arch_destroy_vm; } + r = kvm_arch_init_vm(kvm, type); + if (r) + goto out_err_no_arch_destroy_vm; + + r = kvm_enable_virtualization(); + if (r) + goto out_err_no_disable; + +#ifdef CONFIG_HAVE_KVM_IRQCHIP + INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); +#endif + r = kvm_init_mmu_notifier(kvm); if (r) - goto out_err; + goto out_err_no_mmu_notifier; + + r = kvm_coalesced_mmio_init(kvm); + if (r < 0) + goto out_no_coalesced_mmio; + + r = kvm_create_vm_debugfs(kvm, fdname); + if (r) + goto out_err_no_debugfs; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); preempt_notifier_inc(); + 
kvm_init_pm_notifier(kvm); return kvm; -out_err: +out_err_no_debugfs: + kvm_coalesced_mmio_free(kvm); +out_no_coalesced_mmio: +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER + if (kvm->mmu_notifier.ops) + mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); +#endif +out_err_no_mmu_notifier: + kvm_disable_virtualization(); +out_err_no_disable: + kvm_arch_destroy_vm(kvm); +out_err_no_arch_destroy_vm: + WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); + for (i = 0; i < KVM_NR_BUSES; i++) + kfree(kvm_get_bus_for_destruction(kvm, i)); + kvm_free_irq_routing(kvm); +out_err_no_irq_routing: cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); out_err_no_srcu: - hardware_disable_all(); -out_err_no_disable: - refcount_set(&kvm->users_count, 0); - for (i = 0; i < KVM_NR_BUSES; i++) - kfree(kvm_get_bus(kvm, i)); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); kvm_arch_free_vm(kvm); mmdrop(current->mm); return ERR_PTR(r); @@ -713,6 +1257,12 @@ static void kvm_destroy_devices(struct kvm *kvm) * We do not need to take the kvm->lock here, because nobody else * has a reference to the struct kvm at this point and therefore * cannot access the devices list anyhow. + * + * The device list is generally managed as an rculist, but list_del() + * is used intentionally here. If a bug in KVM introduced a reader that + * was not backed by a reference on the kvm struct, the hope is that + * it'd consume the poisoned forward pointer instead of suffering a + * use-after-free, even though this cannot be guaranteed. 
*/ list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { list_del(&dev->vm_node); @@ -725,35 +1275,60 @@ static void kvm_destroy_vm(struct kvm *kvm) int i; struct mm_struct *mm = kvm->mm; + kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); - kvm_arch_sync_events(kvm); - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_del(&kvm->vm_list); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); + kvm_arch_pre_destroy_vm(kvm); + kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { - struct kvm_io_bus *bus = kvm_get_bus(kvm, i); + struct kvm_io_bus *bus = kvm_get_bus_for_destruction(kvm, i); if (bus) kvm_io_bus_destroy(bus); kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); + /* + * At this point, pending calls to invalidate_range_start() + * have completed but no more MMU notifiers will run, so + * mn_active_invalidate_count may remain unbalanced. + * No threads can be waiting in kvm_swap_active_memslots() as the + * last reference on KVM has been dropped, but freeing + * memslots would deadlock without this manual intervention. + * + * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU + * notifier between a start() and end(), then there shouldn't be any + * in-progress invalidations. 
+ */ + WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); + if (kvm->mn_active_invalidate_count) + kvm->mn_active_invalidate_count = 0; + else + WARN_ON(kvm->mmu_invalidate_in_progress); #else - kvm_arch_flush_shadow_all(kvm); + kvm_flush_shadow_all(kvm); #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { + kvm_free_memslots(kvm, &kvm->__memslots[i][0]); + kvm_free_memslots(kvm, &kvm->__memslots[i][1]); + } cleanup_srcu_struct(&kvm->irq_srcu); + srcu_barrier(&kvm->srcu); cleanup_srcu_struct(&kvm->srcu); +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + xa_destroy(&kvm->mem_attr_array); +#endif kvm_arch_free_vm(kvm); preempt_notifier_dec(); - hardware_disable_all(); + kvm_disable_virtualization(); mmdrop(mm); } @@ -763,6 +1338,16 @@ void kvm_get_kvm(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_get_kvm); +/* + * Make sure the vm is not during destruction, which is a safe version of + * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise. + */ +bool kvm_get_kvm_safe(struct kvm *kvm) +{ + return refcount_inc_not_zero(&kvm->users_count); +} +EXPORT_SYMBOL_GPL(kvm_get_kvm_safe); + void kvm_put_kvm(struct kvm *kvm) { if (refcount_dec_and_test(&kvm->users_count)) @@ -770,6 +1355,18 @@ void kvm_put_kvm(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_put_kvm); +/* + * Used to put a reference that was taken on behalf of an object associated + * with a user-visible file descriptor, e.g. a vcpu or device, if installation + * of the new file descriptor fails and the reference cannot be transferred to + * its final owner. In such cases, the caller is still actively using @kvm and + * will fail miserably if the refcount unexpectedly hits zero. 
+ */ +void kvm_put_kvm_no_destroy(struct kvm *kvm) +{ + WARN_ON(refcount_dec_and_test(&kvm->users_count)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_put_kvm_no_destroy); static int kvm_vm_release(struct inode *inode, struct file *filp) { @@ -781,88 +1378,241 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } +int kvm_trylock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i, j; + + lockdep_assert_held(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) + if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock)) + goto out_unlock; + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + mutex_unlock(&vcpu->mutex); + } + return -EINTR; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_trylock_all_vcpus); + +int kvm_lock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i, j; + int r; + + lockdep_assert_held(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + r = mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock); + if (r) + goto out_unlock; + } + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + mutex_unlock(&vcpu->mutex); + } + return r; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lock_all_vcpus); + +void kvm_unlock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) + mutex_unlock(&vcpu->mutex); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_unlock_all_vcpus); + /* * Allocation size is twice as large as the actual dirty bitmap size. - * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. + * See kvm_vm_ioctl_get_dirty_log() why this is needed. 
*/ -static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) +static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) { - unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); + unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot); - memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); + memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT); if (!memslot->dirty_bitmap) return -ENOMEM; return 0; } +static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id) +{ + struct kvm_memslots *active = __kvm_memslots(kvm, as_id); + int node_idx_inactive = active->node_idx ^ 1; + + return &kvm->__memslots[as_id][node_idx_inactive]; +} + /* - * Insert memslot and re-sort memslots based on their GFN, - * so binary search could be used to lookup GFN. - * Sorting algorithm takes advantage of having initially - * sorted array and known changed memslot position. + * Helper to get the address space ID when one of memslot pointers may be NULL. + * This also serves as a sanity that at least one of the pointers is non-NULL, + * and that their address space IDs don't diverge. 
*/ -static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *new, - enum kvm_mr_change change) +static int kvm_memslots_get_as_id(struct kvm_memory_slot *a, + struct kvm_memory_slot *b) { - int id = new->id; - int i = slots->id_to_index[id]; - struct kvm_memory_slot *mslots = slots->memslots; + if (WARN_ON_ONCE(!a && !b)) + return 0; - WARN_ON(mslots[i].id != id); - switch (change) { - case KVM_MR_CREATE: - slots->used_slots++; - WARN_ON(mslots[i].npages || !new->npages); - break; - case KVM_MR_DELETE: - slots->used_slots--; - WARN_ON(new->npages || !mslots[i].npages); - break; - default: - break; + if (!a) + return b->as_id; + if (!b) + return a->as_id; + + WARN_ON_ONCE(a->as_id != b->as_id); + return a->as_id; +} + +static void kvm_insert_gfn_node(struct kvm_memslots *slots, + struct kvm_memory_slot *slot) +{ + struct rb_root *gfn_tree = &slots->gfn_tree; + struct rb_node **node, *parent; + int idx = slots->node_idx; + + parent = NULL; + for (node = &gfn_tree->rb_node; *node; ) { + struct kvm_memory_slot *tmp; + + tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]); + parent = *node; + if (slot->base_gfn < tmp->base_gfn) + node = &(*node)->rb_left; + else if (slot->base_gfn > tmp->base_gfn) + node = &(*node)->rb_right; + else + BUG(); } - while (i < KVM_MEM_SLOTS_NUM - 1 && - new->base_gfn <= mslots[i + 1].base_gfn) { - if (!mslots[i + 1].npages) - break; - mslots[i] = mslots[i + 1]; - slots->id_to_index[mslots[i].id] = i; - i++; + rb_link_node(&slot->gfn_node[idx], parent, node); + rb_insert_color(&slot->gfn_node[idx], gfn_tree); +} + +static void kvm_erase_gfn_node(struct kvm_memslots *slots, + struct kvm_memory_slot *slot) +{ + rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree); +} + +static void kvm_replace_gfn_node(struct kvm_memslots *slots, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) +{ + int idx = slots->node_idx; + + WARN_ON_ONCE(old->base_gfn != new->base_gfn); + + 
rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx], + &slots->gfn_tree); +} + +/* + * Replace @old with @new in the inactive memslots. + * + * With NULL @old this simply adds @new. + * With NULL @new this simply removes @old. + * + * If @new is non-NULL its hva_node[slots_idx] range has to be set + * appropriately. + */ +static void kvm_replace_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) +{ + int as_id = kvm_memslots_get_as_id(old, new); + struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); + int idx = slots->node_idx; + + if (old) { + hash_del(&old->id_node[idx]); + interval_tree_remove(&old->hva_node[idx], &slots->hva_tree); + + if ((long)old == atomic_long_read(&slots->last_used_slot)) + atomic_long_set(&slots->last_used_slot, (long)new); + + if (!new) { + kvm_erase_gfn_node(slots, old); + return; + } } /* - * The ">=" is needed when creating a slot with base_gfn == 0, - * so that it moves before all those with base_gfn == npages == 0. - * - * On the other hand, if new->npages is zero, the above loop has - * already left i pointing to the beginning of the empty part of - * mslots, and the ">=" would move the hole backwards in this - * case---which is wrong. So skip the loop when deleting a slot. + * Initialize @new's hva range. Do this even when replacing an @old + * slot, kvm_copy_memslot() deliberately does not touch node data. */ - if (new->npages) { - while (i > 0 && - new->base_gfn >= mslots[i - 1].base_gfn) { - mslots[i] = mslots[i - 1]; - slots->id_to_index[mslots[i].id] = i; - i--; - } - } else - WARN_ON_ONCE(i != slots->used_slots); + new->hva_node[idx].start = new->userspace_addr; + new->hva_node[idx].last = new->userspace_addr + + (new->npages << PAGE_SHIFT) - 1; + + /* + * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), + * hva_node needs to be swapped with remove+insert even though hva can't + * change when replacing an existing slot. 
+ */ + hash_add(slots->id_hash, &new->id_node[idx], new->id); + interval_tree_insert(&new->hva_node[idx], &slots->hva_tree); - mslots[i] = *new; - slots->id_to_index[mslots[i].id] = i; + /* + * If the memslot gfn is unchanged, rb_replace_node() can be used to + * switch the node in the gfn tree instead of removing the old and + * inserting the new as two separate operations. Replacement is a + * single O(1) operation versus two O(log(n)) operations for + * remove+insert. + */ + if (old && old->base_gfn == new->base_gfn) { + kvm_replace_gfn_node(slots, old, new); + } else { + if (old) + kvm_erase_gfn_node(slots, old); + kvm_insert_gfn_node(slots, new); + } } -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) +/* + * Flags that do not access any of the extra space of struct + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS + * only allows these. + */ +#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) + +static int check_memory_region_flags(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; -#ifdef __KVM_HAVE_READONLY_MEM - valid_flags |= KVM_MEM_READONLY; -#endif + if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD)) + valid_flags |= KVM_MEM_GUEST_MEMFD; + + /* Dirty logging private memory is not currently supported. */ + if (mem->flags & KVM_MEM_GUEST_MEMFD) + valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + + /* + * GUEST_MEMFD is incompatible with read-only memslots, as writes to + * read-only memslots have emulated MMIO, not page fault, semantics, + * and KVM doesn't allow emulated MMIO for private memory. 
+ */ + if (kvm_arch_has_readonly_mem(kvm) && + !(mem->flags & KVM_MEM_GUEST_MEMFD)) + valid_flags |= KVM_MEM_READONLY; if (mem->flags & ~valid_flags) return -EINVAL; @@ -870,271 +1620,596 @@ static int check_memory_region_flags(const struct kvm_userspace_memory_region *m return 0; } -static struct kvm_memslots *install_new_memslots(struct kvm *kvm, - int as_id, struct kvm_memslots *slots) +static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) { - struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); + struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); + + /* Grab the generation from the activate memslots. */ + u64 gen = __kvm_memslots(kvm, as_id)->generation; + + WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; /* - * Set the low bit in the generation, which disables SPTE caching - * until the end of synchronize_srcu_expedited. + * Do not store the new memslots while there are invalidations in + * progress, otherwise the locking in invalidate_range_start and + * invalidate_range_end will be unbalanced. */ - WARN_ON(old_memslots->generation & 1); - slots->generation = old_memslots->generation + 1; - + spin_lock(&kvm->mn_invalidate_lock); + prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait); + while (kvm->mn_active_invalidate_count) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&kvm->mn_invalidate_lock); + schedule(); + spin_lock(&kvm->mn_invalidate_lock); + } + finish_rcuwait(&kvm->mn_memslots_update_rcuwait); rcu_assign_pointer(kvm->memslots[as_id], slots); + spin_unlock(&kvm->mn_invalidate_lock); + + /* + * Acquired in kvm_set_memslot. Must be released before synchronize + * SRCU below in order to avoid deadlock with another thread + * acquiring the slots_arch_lock in an srcu critical section. + */ + mutex_unlock(&kvm->slots_arch_lock); + synchronize_srcu_expedited(&kvm->srcu); /* - * Increment the new memslot generation a second time. 
This prevents - * vm exits that race with memslot updates from caching a memslot - * generation that will (potentially) be valid forever. - * + * Increment the new memslot generation a second time, dropping the + * update in-progress flag and incrementing the generation based on + * the number of address spaces. This provides a unique and easily + * identifiable generation number while the memslots are in flux. + */ + gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; + + /* * Generations must be unique even across address spaces. We do not need * a global counter for that, instead the generation space is evenly split * across address spaces. For example, with two address spaces, address - * space 0 will use generations 0, 4, 8, ... while * address space 1 will - * use generations 2, 6, 10, 14, ... + * space 0 will use generations 0, 2, 4, ... while address space 1 will + * use generations 1, 3, 5, ... + */ + gen += kvm_arch_nr_memslot_as_ids(kvm); + + kvm_arch_memslots_updated(kvm, gen); + + slots->generation = gen; +} + +static int kvm_prepare_memory_region(struct kvm *kvm, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + int r; + + /* + * If dirty logging is disabled, nullify the bitmap; the old bitmap + * will be freed on "commit". If logging is enabled in both old and + * new, reuse the existing bitmap. If logging is enabled only in the + * new and KVM isn't using a ring buffer, allocate and initialize a + * new bitmap. 
*/ - slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1; + if (change != KVM_MR_DELETE) { + if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + new->dirty_bitmap = NULL; + else if (old && old->dirty_bitmap) + new->dirty_bitmap = old->dirty_bitmap; + else if (kvm_use_dirty_bitmap(kvm)) { + r = kvm_alloc_dirty_bitmap(new); + if (r) + return r; + + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + bitmap_set(new->dirty_bitmap, 0, new->npages); + } + } + + r = kvm_arch_prepare_memory_region(kvm, old, new, change); - kvm_arch_memslots_updated(kvm, slots); + /* Free the bitmap on failure if it was allocated above. */ + if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap)) + kvm_destroy_dirty_bitmap(new); - return old_memslots; + return r; +} + +static void kvm_commit_memory_region(struct kvm *kvm, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + int old_flags = old ? old->flags : 0; + int new_flags = new ? new->flags : 0; + /* + * Update the total number of memslot pages before calling the arch + * hook so that architectures can consume the result directly. + */ + if (change == KVM_MR_DELETE) + kvm->nr_memslot_pages -= old->npages; + else if (change == KVM_MR_CREATE) + kvm->nr_memslot_pages += new->npages; + + if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) { + int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1; + atomic_set(&kvm->nr_memslots_dirty_logging, + atomic_read(&kvm->nr_memslots_dirty_logging) + change); + } + + kvm_arch_commit_memory_region(kvm, old, new, change); + + switch (change) { + case KVM_MR_CREATE: + /* Nothing more to do. */ + break; + case KVM_MR_DELETE: + /* Free the old memslot and all its metadata. 
*/ + kvm_free_memslot(kvm, old); + break; + case KVM_MR_MOVE: + case KVM_MR_FLAGS_ONLY: + /* + * Free the dirty bitmap as needed; the below check encompasses + * both the flags and whether a ring buffer is being used) + */ + if (old->dirty_bitmap && !new->dirty_bitmap) + kvm_destroy_dirty_bitmap(old); + + /* + * The final quirk. Free the detached, old slot, but only its + * memory, not any metadata. Metadata, including arch specific + * data, may be reused by @new. + */ + kfree(old); + break; + default: + BUG(); + } } /* - * Allocate some memory and give it an address in the guest physical address - * space. - * - * Discontiguous memory is allowed, mostly for framebuffers. + * Activate @new, which must be installed in the inactive slots by the caller, + * by swapping the active slots and then propagating @new to @old once @old is + * unreachable and can be safely modified. * - * Must be called holding kvm->slots_lock for write. + * With NULL @old this simply adds @new to @active (while swapping the sets). + * With NULL @new this simply removes @old from @active and frees it + * (while also swapping the sets). */ -int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) +static void kvm_activate_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) +{ + int as_id = kvm_memslots_get_as_id(old, new); + + kvm_swap_active_memslots(kvm, as_id); + + /* Propagate the new memslot to the now inactive memslots. 
*/ + kvm_replace_memslot(kvm, old, new); +} + +static void kvm_copy_memslot(struct kvm_memory_slot *dest, + const struct kvm_memory_slot *src) { + dest->base_gfn = src->base_gfn; + dest->npages = src->npages; + dest->dirty_bitmap = src->dirty_bitmap; + dest->arch = src->arch; + dest->userspace_addr = src->userspace_addr; + dest->flags = src->flags; + dest->id = src->id; + dest->as_id = src->as_id; +} + +static void kvm_invalidate_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *invalid_slot) +{ + /* + * Mark the current slot INVALID. As with all memslot modifications, + * this must be done on an unreachable slot to avoid modifying the + * current slot in the active tree. + */ + kvm_copy_memslot(invalid_slot, old); + invalid_slot->flags |= KVM_MEMSLOT_INVALID; + kvm_replace_memslot(kvm, old, invalid_slot); + + /* + * Activate the slot that is now marked INVALID, but don't propagate + * the slot to the now inactive slots. The slot is either going to be + * deleted or recreated as a new slot. + */ + kvm_swap_active_memslots(kvm, old->as_id); + + /* + * From this point no new shadow pages pointing to a deleted, or moved, + * memslot will be created. Validation of sp->gfn happens in: + * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) + * - kvm_is_visible_gfn (mmu_check_root) + */ + kvm_arch_flush_shadow_memslot(kvm, old); + kvm_arch_guest_memory_reclaimed(kvm); + + /* Was released by kvm_swap_active_memslots(), reacquire. */ + mutex_lock(&kvm->slots_arch_lock); + + /* + * Copy the arch-specific field of the newly-installed slot back to the + * old slot as the arch data could have changed between releasing + * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock + * above. Writers are required to retrieve memslots *after* acquiring + * slots_arch_lock, thus the active slot's data is guaranteed to be fresh. 
+ */ + old->arch = invalid_slot->arch; +} + +static void kvm_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *new) +{ + /* Add the new memslot to the inactive set and activate. */ + kvm_replace_memslot(kvm, NULL, new); + kvm_activate_memslot(kvm, NULL, new); +} + +static void kvm_delete_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *invalid_slot) +{ + /* + * Remove the old memslot (in the inactive memslots) by passing NULL as + * the "new" slot, and for the invalid version in the active slots. + */ + kvm_replace_memslot(kvm, old, NULL); + kvm_activate_memslot(kvm, invalid_slot, NULL); +} + +static void kvm_move_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + struct kvm_memory_slot *invalid_slot) +{ + /* + * Replace the old memslot in the inactive slots, and then swap slots + * and replace the current INVALID with the new as well. + */ + kvm_replace_memslot(kvm, old, new); + kvm_activate_memslot(kvm, invalid_slot, new); +} + +static void kvm_update_flags_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) +{ + /* + * Similar to the MOVE case, but the slot doesn't need to be zapped as + * an intermediate step. Instead, the old memslot is simply replaced + * with a new, updated copy in both memslot sets. + */ + kvm_replace_memslot(kvm, old, new); + kvm_activate_memslot(kvm, old, new); +} + +static int kvm_set_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + struct kvm_memory_slot *invalid_slot; int r; - gfn_t base_gfn; + + /* + * Released in kvm_swap_active_memslots(). + * + * Must be held from before the current memslots are copied until after + * the new memslots are installed with rcu_assign_pointer, then + * released before the synchronize srcu in kvm_swap_active_memslots(). 
+ * + * When modifying memslots outside of the slots_lock, must be held + * before reading the pointer to the current memslots until after all + * changes to those memslots are complete. + * + * These rules ensure that installing new memslots does not lose + * changes made to the previous memslots. + */ + mutex_lock(&kvm->slots_arch_lock); + + /* + * Invalidate the old slot if it's being deleted or moved. This is + * done prior to actually deleting/moving the memslot to allow vCPUs to + * continue running by ensuring there are no mappings or shadow pages + * for the memslot when it is deleted/moved. Without pre-invalidation + * (and without a lock), a window would exist between effecting the + * delete/move and committing the changes in arch code where KVM or a + * guest could access a non-existent memslot. + * + * Modifications are done on a temporary, unreachable slot. The old + * slot needs to be preserved in case a later step fails and the + * invalidation needs to be reverted. + */ + if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { + invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT); + if (!invalid_slot) { + mutex_unlock(&kvm->slots_arch_lock); + return -ENOMEM; + } + kvm_invalidate_memslot(kvm, old, invalid_slot); + } + + r = kvm_prepare_memory_region(kvm, old, new, change); + if (r) { + /* + * For DELETE/MOVE, revert the above INVALID change. No + * modifications required since the original slot was preserved + * in the inactive slots. Changing the active memslots also + * release slots_arch_lock. + */ + if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { + kvm_activate_memslot(kvm, invalid_slot, old); + kfree(invalid_slot); + } else { + mutex_unlock(&kvm->slots_arch_lock); + } + return r; + } + + /* + * For DELETE and MOVE, the working slot is now active as the INVALID + * version of the old slot. MOVE is particularly special as it reuses + * the old slot and returns a copy of the old slot (in working_slot). 
+ * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the + * old slot is detached but otherwise preserved. + */ + if (change == KVM_MR_CREATE) + kvm_create_memslot(kvm, new); + else if (change == KVM_MR_DELETE) + kvm_delete_memslot(kvm, old, invalid_slot); + else if (change == KVM_MR_MOVE) + kvm_move_memslot(kvm, old, new, invalid_slot); + else if (change == KVM_MR_FLAGS_ONLY) + kvm_update_flags_memslot(kvm, old, new); + else + BUG(); + + /* Free the temporary INVALID slot used for DELETE and MOVE. */ + if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) + kfree(invalid_slot); + + /* + * No need to refresh new->arch, changes after dropping slots_arch_lock + * will directly hit the final, active memslot. Architectures are + * responsible for knowing that new->arch may be stale. + */ + kvm_commit_memory_region(kvm, old, new, change); + + return 0; +} + +static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, + gfn_t start, gfn_t end) +{ + struct kvm_memslot_iter iter; + + kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) { + if (iter.slot->id != id) + return true; + } + + return false; +} + +static int kvm_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem) +{ + struct kvm_memory_slot *old, *new; + struct kvm_memslots *slots; + enum kvm_mr_change change; unsigned long npages; - struct kvm_memory_slot *slot; - struct kvm_memory_slot old, new; - struct kvm_memslots *slots = NULL, *old_memslots; + gfn_t base_gfn; int as_id, id; - enum kvm_mr_change change; + int r; + + lockdep_assert_held(&kvm->slots_lock); - r = check_memory_region_flags(mem); + r = check_memory_region_flags(kvm, mem); if (r) - goto out; + return r; - r = -EINVAL; as_id = mem->slot >> 16; id = (u16)mem->slot; /* General sanity checks */ - if (mem->memory_size & (PAGE_SIZE - 1)) - goto out; + if ((mem->memory_size & (PAGE_SIZE - 1)) || + (mem->memory_size != (unsigned long)mem->memory_size)) + return -EINVAL; if 
(mem->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; + return -EINVAL; /* We can read the guest memory with __xxx_user() later on. */ - if ((id < KVM_USER_MEM_SLOTS) && - ((mem->userspace_addr & (PAGE_SIZE - 1)) || + if ((mem->userspace_addr & (PAGE_SIZE - 1)) || + (mem->userspace_addr != untagged_addr(mem->userspace_addr)) || !access_ok((void __user *)(unsigned long)mem->userspace_addr, - mem->memory_size))) - goto out; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) - goto out; + mem->memory_size)) + return -EINVAL; + if (mem->flags & KVM_MEM_GUEST_MEMFD && + (mem->guest_memfd_offset & (PAGE_SIZE - 1) || + mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset)) + return -EINVAL; + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM) + return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) - goto out; - - slot = id_to_memslot(__kvm_memslots(kvm, as_id), id); - base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; - npages = mem->memory_size >> PAGE_SHIFT; - - if (npages > KVM_MEM_MAX_NR_PAGES) - goto out; + return -EINVAL; - new = old = *slot; + /* + * The size of userspace-defined memory regions is restricted in order + * to play nice with dirty bitmap operations, which are indexed with an + * "unsigned int". KVM's internal memory regions don't support dirty + * logging, and so are exempt. + */ + if (id < KVM_USER_MEM_SLOTS && + (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) + return -EINVAL; - new.id = id; - new.base_gfn = base_gfn; - new.npages = npages; - new.flags = mem->flags; + slots = __kvm_memslots(kvm, as_id); - if (npages) { - if (!old.npages) - change = KVM_MR_CREATE; - else { /* Modify an existing slot. */ - if ((mem->userspace_addr != old.userspace_addr) || - (npages != old.npages) || - ((new.flags ^ old.flags) & KVM_MEM_READONLY)) - goto out; + /* + * Note, the old memslot (and the pointer itself!) may be invalidated + * and/or destroyed by kvm_set_memslot(). 
+ */ + old = id_to_memslot(slots, id); - if (base_gfn != old.base_gfn) - change = KVM_MR_MOVE; - else if (new.flags != old.flags) - change = KVM_MR_FLAGS_ONLY; - else { /* Nothing to change. */ - r = 0; - goto out; - } - } - } else { - if (!old.npages) - goto out; + if (!mem->memory_size) { + if (!old || !old->npages) + return -EINVAL; - change = KVM_MR_DELETE; - new.base_gfn = 0; - new.flags = 0; - } + if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages)) + return -EIO; - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { - /* Check for overlaps */ - r = -EEXIST; - kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { - if (slot->id == id) - continue; - if (!((base_gfn + npages <= slot->base_gfn) || - (base_gfn >= slot->base_gfn + slot->npages))) - goto out; - } + return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE); } - /* Free page dirty bitmap if unneeded */ - if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) - new.dirty_bitmap = NULL; + base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT); + npages = (mem->memory_size >> PAGE_SHIFT); - r = -ENOMEM; - if (change == KVM_MR_CREATE) { - new.userspace_addr = mem->userspace_addr; + if (!old || !old->npages) { + change = KVM_MR_CREATE; - if (kvm_arch_create_memslot(kvm, &new, npages)) - goto out_free; - } + /* + * To simplify KVM internals, the total number of pages across + * all memslots must fit in an unsigned long. + */ + if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) + return -EINVAL; + } else { /* Modify an existing slot. */ + /* Private memslots are immutable, they can only be deleted. 
*/ + if (mem->flags & KVM_MEM_GUEST_MEMFD) + return -EINVAL; + if ((mem->userspace_addr != old->userspace_addr) || + (npages != old->npages) || + ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) + return -EINVAL; - /* Allocate page dirty bitmap if needed */ - if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - if (kvm_create_dirty_bitmap(&new) < 0) - goto out_free; + if (base_gfn != old->base_gfn) + change = KVM_MR_MOVE; + else if (mem->flags != old->flags) + change = KVM_MR_FLAGS_ONLY; + else /* Nothing to change. */ + return 0; } - slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) - goto out_free; - memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); - - if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { - slot = id_to_memslot(slots, id); - slot->flags |= KVM_MEMSLOT_INVALID; - - old_memslots = install_new_memslots(kvm, as_id, slots); + if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) && + kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages)) + return -EEXIST; - /* From this point no new shadow pages pointing to a deleted, - * or moved, memslot will be created. - * - * validation of sp->gfn happens in: - * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) - * - kvm_is_visible_gfn (mmu_check_roots) - */ - kvm_arch_flush_shadow_memslot(kvm, slot); + /* Allocate a slot that will persist in the memslot. */ + new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; - /* - * We can re-use the old_memslots from above, the only difference - * from the currently installed memslots is the invalid flag. This - * will get overwritten by update_memslots anyway. 
- */ - slots = old_memslots; + new->as_id = as_id; + new->id = id; + new->base_gfn = base_gfn; + new->npages = npages; + new->flags = mem->flags; + new->userspace_addr = mem->userspace_addr; + if (mem->flags & KVM_MEM_GUEST_MEMFD) { + r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset); + if (r) + goto out; } - r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); + r = kvm_set_memslot(kvm, old, new, change); if (r) - goto out_slots; - - /* actual memory is freed via old in kvm_free_memslot below */ - if (change == KVM_MR_DELETE) { - new.dirty_bitmap = NULL; - memset(&new.arch, 0, sizeof(new.arch)); - } - - update_memslots(slots, &new, change); - old_memslots = install_new_memslots(kvm, as_id, slots); + goto out_unbind; - kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); - - kvm_free_memslot(kvm, &old, &new); - kvfree(old_memslots); return 0; -out_slots: - kvfree(slots); -out_free: - kvm_free_memslot(kvm, &new, &old); +out_unbind: + if (mem->flags & KVM_MEM_GUEST_MEMFD) + kvm_gmem_unbind(new); out: + kfree(new); return r; } -EXPORT_SYMBOL_GPL(__kvm_set_memory_region); -int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) +int kvm_set_internal_memslot(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem) { - int r; + if (WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS)) + return -EINVAL; - mutex_lock(&kvm->slots_lock); - r = __kvm_set_memory_region(kvm, mem); - mutex_unlock(&kvm->slots_lock); - return r; + if (WARN_ON_ONCE(mem->flags)) + return -EINVAL; + + return kvm_set_memory_region(kvm, mem); } -EXPORT_SYMBOL_GPL(kvm_set_memory_region); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_internal_memslot); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region2 *mem) { if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; + guard(mutex)(&kvm->slots_lock); return kvm_set_memory_region(kvm, mem); } -int 
kvm_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log, int *is_dirty) +#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT +/** + * kvm_get_dirty_log - get a snapshot of dirty pages + * @kvm: pointer to kvm instance + * @log: slot id and address to which we copy the log + * @is_dirty: set to '1' if any dirty pages were found + * @memslot: set to the associated memslot, always valid on success + */ +int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, + int *is_dirty, struct kvm_memory_slot **memslot) { struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; int i, as_id, id; unsigned long n; unsigned long any = 0; + /* Dirty ring tracking may be exclusive to dirty log tracking */ + if (!kvm_use_dirty_bitmap(kvm)) + return -ENXIO; + + *memslot = NULL; + *is_dirty = 0; + as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); - memslot = id_to_memslot(slots, id); - if (!memslot->dirty_bitmap) + *memslot = id_to_memslot(slots, id); + if (!(*memslot) || !(*memslot)->dirty_bitmap) return -ENOENT; - n = kvm_dirty_bitmap_bytes(memslot); + kvm_arch_sync_dirty_log(kvm, *memslot); + + n = kvm_dirty_bitmap_bytes(*memslot); for (i = 0; !any && i < n/sizeof(long); ++i) - any = memslot->dirty_bitmap[i]; + any = (*memslot)->dirty_bitmap[i]; - if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n)) return -EFAULT; if (any) *is_dirty = 1; return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dirty_log); -#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT +#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ /** - * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages + * kvm_get_dirty_log_protect - get a snapshot of dirty pages * and reenable 
dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log - * @is_dirty: flag set if any page is dirty * * We need to keep it in mind that VCPU threads can write to the bitmap * concurrently. So, to avoid losing track of dirty pages we keep the @@ -1151,8 +2226,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); * exiting to userspace will be logged for the next call. * */ -int kvm_get_dirty_log_protect(struct kvm *kvm, - struct kvm_dirty_log *log, bool *flush) +static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1160,21 +2234,28 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, unsigned long n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; + bool flush; + + /* Dirty ring tracking may be exclusive to dirty log tracking */ + if (!kvm_use_dirty_bitmap(kvm)) + return -ENXIO; as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); + if (!memslot || !memslot->dirty_bitmap) + return -ENOENT; dirty_bitmap = memslot->dirty_bitmap; - if (!dirty_bitmap) - return -ENOENT; + + kvm_arch_sync_dirty_log(kvm, memslot); n = kvm_dirty_bitmap_bytes(memslot); - *flush = false; + flush = false; if (kvm->manual_dirty_log_protect) { /* * Unlike kvm_get_dirty_log, we always return false in *flush, @@ -1189,7 +2270,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); memset(dirty_bitmap_buffer, 0, n); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; @@ -1197,24 +2278,57 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, if (!dirty_bitmap[i]) continue; - *flush = true; 
+ flush = true; mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; - if (mask) { - offset = i * BITS_PER_LONG; - kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, - offset, mask); - } + offset = i * BITS_PER_LONG; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); } - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); } + if (flush) + kvm_flush_remote_tlbs_memslot(kvm, memslot); + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); + + +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. 
+ */ +static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log_protect(kvm, log); + + mutex_unlock(&kvm->slots_lock); + return r; +} /** * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap @@ -1222,8 +2336,8 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); * @kvm: pointer to kvm instance * @log: slot id and address from which to fetch the bitmap of dirty pages */ -int kvm_clear_dirty_log_protect(struct kvm *kvm, - struct kvm_clear_dirty_log *log, bool *flush) +static int kvm_clear_dirty_log_protect(struct kvm *kvm, + struct kvm_clear_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1232,36 +2346,44 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm, unsigned long i, n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; + bool flush; + + /* Dirty ring tracking may be exclusive to dirty log tracking */ + if (!kvm_use_dirty_bitmap(kvm)) + return -ENXIO; as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; - if ((log->first_page & 63) || (log->num_pages & 63)) + if (log->first_page & 63) return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); + if (!memslot || !memslot->dirty_bitmap) + return -ENOENT; dirty_bitmap = memslot->dirty_bitmap; - if (!dirty_bitmap) - return -ENOENT; - n = kvm_dirty_bitmap_bytes(memslot); + n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; if (log->first_page > memslot->npages || - log->num_pages > memslot->npages - log->first_page) - return -EINVAL; + log->num_pages > memslot->npages - log->first_page || + (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) + return -EINVAL; - *flush = false; + kvm_arch_sync_dirty_log(kvm, memslot); + + flush = false; dirty_bitmap_buffer = 
kvm_second_dirty_bitmap(memslot); if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) return -EFAULT; - spin_lock(&kvm->mmu_lock); - for (offset = log->first_page, - i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--; + KVM_MMU_LOCK(kvm); + for (offset = log->first_page, i = offset / BITS_PER_LONG, + n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; i++, offset += BITS_PER_LONG) { unsigned long mask = *dirty_bitmap_buffer++; atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; @@ -1277,64 +2399,308 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm, * a problem if userspace sets them in log->dirty_bitmap. */ if (mask) { - *flush = true; + flush = true; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } } - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); + + if (flush) + kvm_flush_remote_tlbs_memslot(kvm, memslot); return 0; } -EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect); -#endif -bool kvm_largepages_enabled(void) +static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, + struct kvm_clear_dirty_log *log) { - return largepages_enabled; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_clear_dirty_log_protect(kvm, log); + + mutex_unlock(&kvm->slots_lock); + return r; } +#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ -void kvm_disable_largepages(void) +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static u64 kvm_supported_mem_attributes(struct kvm *kvm) { - largepages_enabled = false; + if (!kvm || kvm_arch_has_private_mem(kvm)) + return KVM_MEMORY_ATTRIBUTE_PRIVATE; + + return 0; } -EXPORT_SYMBOL_GPL(kvm_disable_largepages); + +/* + * Returns true if _all_ gfns in the range [@start, @end) have attributes + * such that the bits in @mask match @attrs. 
+ */ +bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, + unsigned long mask, unsigned long attrs) +{ + XA_STATE(xas, &kvm->mem_attr_array, start); + unsigned long index; + void *entry; + + mask &= kvm_supported_mem_attributes(kvm); + if (attrs & ~mask) + return false; + + if (end == start + 1) + return (kvm_get_memory_attributes(kvm, start) & mask) == attrs; + + guard(rcu)(); + if (!attrs) + return !xas_find(&xas, end - 1); + + for (index = start; index < end; index++) { + do { + entry = xas_next(&xas); + } while (xas_retry(&xas, entry)); + + if (xas.xa_index != index || + (xa_to_value(entry) & mask) != attrs) + return false; + } + + return true; +} + +static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, + struct kvm_mmu_notifier_range *range) +{ + struct kvm_gfn_range gfn_range; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + struct kvm_memslot_iter iter; + bool found_memslot = false; + bool ret = false; + int i; + + gfn_range.arg = range->arg; + gfn_range.may_block = range->may_block; + + /* + * If/when KVM supports more attributes beyond private .vs shared, this + * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target + * range already has the desired private vs. shared state (it's unclear + * if that is a net win). For now, KVM reaches this point if and only + * if the private flag is being toggled, i.e. all mappings are in play. 
+ */ + + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { + slots = __kvm_memslots(kvm, i); + + kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) { + slot = iter.slot; + gfn_range.slot = slot; + + gfn_range.start = max(range->start, slot->base_gfn); + gfn_range.end = min(range->end, slot->base_gfn + slot->npages); + if (gfn_range.start >= gfn_range.end) + continue; + + if (!found_memslot) { + found_memslot = true; + KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm); + } + + ret |= range->handler(kvm, &gfn_range); + } + } + + if (range->flush_on_ret && ret) + kvm_flush_remote_tlbs(kvm); + + if (found_memslot) + KVM_MMU_UNLOCK(kvm); +} + +static bool kvm_pre_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + /* + * Unconditionally add the range to the invalidation set, regardless of + * whether or not the arch callback actually needs to zap SPTEs. E.g. + * if KVM supports RWX attributes in the future and the attributes are + * going from R=>RW, zapping isn't strictly necessary. Unconditionally + * adding the range allows KVM to require that MMU invalidations add at + * least one range between begin() and end(), e.g. allows KVM to detect + * bugs where the add() is missed. Relaxing the rule *might* be safe, + * but it's not obvious that allowing new mappings while the attributes + * are in flux is desirable or worth the complexity. + */ + kvm_mmu_invalidate_range_add(kvm, range->start, range->end); + + return kvm_arch_pre_set_memory_attributes(kvm, range); +} + +/* Set @attributes for the gfn range [@start, @end). 
*/ +static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, + unsigned long attributes) +{ + struct kvm_mmu_notifier_range pre_set_range = { + .start = start, + .end = end, + .arg.attributes = attributes, + .handler = kvm_pre_set_memory_attributes, + .on_lock = kvm_mmu_invalidate_begin, + .flush_on_ret = true, + .may_block = true, + }; + struct kvm_mmu_notifier_range post_set_range = { + .start = start, + .end = end, + .arg.attributes = attributes, + .handler = kvm_arch_post_set_memory_attributes, + .on_lock = kvm_mmu_invalidate_end, + .may_block = true, + }; + unsigned long i; + void *entry; + int r = 0; + + entry = attributes ? xa_mk_value(attributes) : NULL; + + trace_kvm_vm_set_mem_attributes(start, end, attributes); + + mutex_lock(&kvm->slots_lock); + + /* Nothing to do if the entire range has the desired attributes. */ + if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes)) + goto out_unlock; + + /* + * Reserve memory ahead of time to avoid having to deal with failures + * partway through setting the new attributes. + */ + for (i = start; i < end; i++) { + r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); + if (r) + goto out_unlock; + + cond_resched(); + } + + kvm_handle_gfn_range(kvm, &pre_set_range); + + for (i = start; i < end; i++) { + r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, + GFP_KERNEL_ACCOUNT)); + KVM_BUG_ON(r, kvm); + cond_resched(); + } + + kvm_handle_gfn_range(kvm, &post_set_range); + +out_unlock: + mutex_unlock(&kvm->slots_lock); + + return r; +} +static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, + struct kvm_memory_attributes *attrs) +{ + gfn_t start, end; + + /* flags is currently not used. 
*/ + if (attrs->flags) + return -EINVAL; + if (attrs->attributes & ~kvm_supported_mem_attributes(kvm)) + return -EINVAL; + if (attrs->size == 0 || attrs->address + attrs->size < attrs->address) + return -EINVAL; + if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size)) + return -EINVAL; + + start = attrs->address >> PAGE_SHIFT; + end = (attrs->address + attrs->size) >> PAGE_SHIFT; + + /* + * xarray tracks data using "unsigned long", and as a result so does + * KVM. For simplicity, supports generic attributes only on 64-bit + * architectures. + */ + BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long)); + + return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes); +} +#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { - return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); + struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + u64 gen = slots->generation; + struct kvm_memory_slot *slot; + + /* + * This also protects against using a memslot from a different address space, + * since different address spaces have different generation numbers. + */ + if (unlikely(gen != vcpu->last_used_slot_gen)) { + vcpu->last_used_slot = NULL; + vcpu->last_used_slot_gen = gen; + } + + slot = try_get_memslot(vcpu->last_used_slot, gfn); + if (slot) + return slot; + + /* + * Fall back to searching all memslots. We purposely use + * search_memslots() instead of __gfn_to_memslot() to avoid + * thrashing the VM-wide last_used_slot in kvm_memslots. 
+ */ + slot = search_memslots(slots, gfn, false); + if (slot) { + vcpu->last_used_slot = slot; + return slot; + } + + return NULL; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_memslot); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); - if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || - memslot->flags & KVM_MEMSLOT_INVALID) - return false; + return kvm_is_visible_memslot(memslot); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_visible_gfn); - return true; +bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + + return kvm_is_visible_memslot(memslot); } -EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_visible_gfn); -unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr, size; size = PAGE_SIZE; - addr = gfn_to_hva(kvm, gfn); + addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL); if (kvm_is_error_hva(addr)) return PAGE_SIZE; - down_read(&current->mm->mmap_sem); + mmap_read_lock(current->mm); vma = find_vma(current->mm, addr); if (!vma) goto out; @@ -1342,17 +2708,17 @@ unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) size = vma_kernel_pagesize(vma); out: - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); return size; } -static bool memslot_is_readonly(struct kvm_memory_slot *slot) +static bool memslot_is_readonly(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_READONLY; } -static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, +static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages, bool write) { if (!slot || slot->flags & KVM_MEMSLOT_INVALID) @@ -1378,19 +2744,19 @@ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, { return
gfn_to_hva_many(slot, gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva_memslot); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_hva); /* * Return the hva of a @gfn and the R/W attribute if possible. @@ -1425,39 +2791,93 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } -static inline int check_user_page_hwpoison(unsigned long addr) +static bool kvm_is_ad_tracked_page(struct page *page) +{ + /* + * Per page-flags.h, pages tagged PG_reserved "should in general not be + * touched (e.g. set dirty) except by its owner". 
+ */ + return !PageReserved(page); +} + +static void kvm_set_page_dirty(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + SetPageDirty(page); +} + +static void kvm_set_page_accessed(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + mark_page_accessed(page); +} + +void kvm_release_page_clean(struct page *page) +{ + if (!page) + return; + + kvm_set_page_accessed(page); + put_page(page); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_clean); + +void kvm_release_page_dirty(struct page *page) +{ + if (!page) + return; + + kvm_set_page_dirty(page); + kvm_release_page_clean(page); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_dirty); + +static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, + struct follow_pfnmap_args *map, bool writable) { - int rc, flags = FOLL_HWPOISON | FOLL_WRITE; + kvm_pfn_t pfn; + + WARN_ON_ONCE(!!page == !!map); - rc = get_user_pages(addr, 1, flags, NULL, NULL); - return rc == -EHWPOISON; + if (kfp->map_writable) + *kfp->map_writable = writable; + + if (map) + pfn = map->pfn; + else + pfn = page_to_pfn(page); + + *kfp->refcounted_page = page; + + return pfn; } /* * The fast path to get the writable pfn which will be stored in @pfn, - * true indicates success, otherwise false is returned. It's also the - * only part that runs if we can are in atomic context. + * true indicates success, otherwise false is returned. */ -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, - bool *writable, kvm_pfn_t *pfn) +static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { - struct page *page[1]; - int npages; + struct page *page; + bool r; /* - * Fast pin a writable pfn only if it is a write fault request - * or the caller allows to map a writable pfn for a read fault - * request. + * Try the fast-only path when the caller wants to pin/get the page for + * writing. 
If the caller only wants to read the page, KVM must go + * down the full, slow path in order to avoid racing an operation that + * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing + * at the old, read-only page while mm/ points at a new, writable page. */ - if (!(write_fault || writable)) + if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; - npages = __get_user_pages_fast(addr, 1, 1, page); - if (npages == 1) { - *pfn = page_to_pfn(page[0]); + if (kfp->pin) + r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; + else + r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); - if (writable) - *writable = true; + if (r) { + *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } @@ -1468,38 +2888,48 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, - bool *writable, kvm_pfn_t *pfn) +static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { - unsigned int flags = FOLL_HWPOISON; - struct page *page; - int npages = 0; - - might_sleep(); - - if (writable) - *writable = write_fault; - - if (write_fault) - flags |= FOLL_WRITE; - if (async) - flags |= FOLL_NOWAIT; + /* + * When a VCPU accesses a page that is not mapped into the secondary + * MMU, we lookup the page using GUP to map it, so the guest VCPU can + * make progress. We always want to honor NUMA hinting faults in that + * case, because GUP usage corresponds to memory accesses from the VCPU. + * Otherwise, we'd not trigger NUMA hinting faults once a page is + * mapped into the secondary MMU and gets accessed by a VCPU. + * + * Note that get_user_page_fast_only() and FOLL_WRITE for now + * implicitly honor NUMA hinting faults and don't need this flag. 
+ */ + unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags; + struct page *page, *wpage; + int npages; - npages = get_user_pages_unlocked(addr, 1, &page, flags); + if (kfp->pin) + npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); + else + npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; - /* map read fault as writable if possible */ - if (unlikely(!write_fault) && writable) { - struct page *wpage; + /* + * Pinning is mutually exclusive with opportunistically mapping a read + * fault as writable, as KVM should never pin pages when mapping memory + * into the guest (pinning is only for direct accesses from KVM). + */ + if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) + goto out; - if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { - *writable = true; - put_page(page); - page = wpage; - } + /* map read fault as writable if possible */ + if (!(flags & FOLL_WRITE) && kfp->map_writable && + get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { + put_page(page); + page = wpage; + flags |= FOLL_WRITE; } - *pfn = page_to_pfn(page); + +out: + *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } @@ -1515,21 +2945,28 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) } static int hva_to_pfn_remapped(struct vm_area_struct *vma, - unsigned long addr, bool *async, - bool write_fault, bool *writable, - kvm_pfn_t *p_pfn) + struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { - unsigned long pfn; + struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva }; + bool write_fault = kfp->flags & FOLL_WRITE; int r; - r = follow_pfn(vma, addr, &pfn); + /* + * Remapped memory cannot be pinned in any meaningful sense. Bail if + * the caller wants to pin the page, i.e. access the page outside of + * MMU notifier protection, and unsafe umappings are disallowed. 
+ */ + if (kfp->pin && !allow_unsafe_mappings) + return -EINVAL; + + r = follow_pfnmap_start(&args); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does * not call the fault handler, so do it here. */ bool unlocked = false; - r = fixup_user_fault(current, current->mm, addr, + r = fixup_user_fault(current->mm, kfp->hva, (write_fault ? FAULT_FLAG_WRITE : 0), &unlocked); if (unlocked) @@ -1537,169 +2974,110 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, if (r) return r; - r = follow_pfn(vma, addr, &pfn); + r = follow_pfnmap_start(&args); if (r) return r; - } - if (writable) - *writable = true; - - /* - * Get a reference here because callers of *hva_to_pfn* and - * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the - * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will - * simply do nothing for reserved pfns. - * - * Whoever called remap_pfn_range is also going to call e.g. - * unmap_mapping_range before the underlying pages are freed, - * causing a call to our MMU notifier. - */ - kvm_get_pfn(pfn); + if (write_fault && !args.writable) { + *p_pfn = KVM_PFN_ERR_RO_FAULT; + goto out; + } - *p_pfn = pfn; - return 0; + *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable); +out: + follow_pfnmap_end(&args); + return r; } -/* - * Pin guest page in memory and return its pfn. - * @addr: host virtual address which maps memory to the guest - * @atomic: whether this function can sleep - * @async: whether this function need to wait IO complete if the - * host page is not in the memory - * @write_fault: whether we should get a writable host page - * @writable: whether it allows to map a writable host page for !@write_fault - * - * The function will map a writable host page for these two cases: - * 1): @write_fault = true - * 2): @write_fault = false && @writable, @writable will tell the caller - * whether the mapping is writable. 
- */ -static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, - bool write_fault, bool *writable) +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) { struct vm_area_struct *vma; - kvm_pfn_t pfn = 0; + kvm_pfn_t pfn; int npages, r; - /* we can do it either atomically or asynchronously, not both */ - BUG_ON(atomic && async); - - if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) - return pfn; + might_sleep(); - if (atomic) + if (WARN_ON_ONCE(!kfp->refcounted_page)) return KVM_PFN_ERR_FAULT; - npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); - if (npages == 1) + if (hva_to_pfn_fast(kfp, &pfn)) return pfn; - down_read(&current->mm->mmap_sem); - if (npages == -EHWPOISON || - (!async && check_user_page_hwpoison(addr))) { - pfn = KVM_PFN_ERR_HWPOISON; - goto exit; - } + npages = hva_to_pfn_slow(kfp, &pfn); + if (npages == 1) + return pfn; + if (npages == -EINTR || npages == -EAGAIN) + return KVM_PFN_ERR_SIGPENDING; + if (npages == -EHWPOISON) + return KVM_PFN_ERR_HWPOISON; + mmap_read_lock(current->mm); retry: - vma = find_vma_intersection(current->mm, addr, addr + 1); + vma = vma_lookup(current->mm, kfp->hva); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); + r = hva_to_pfn_remapped(vma, kfp, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { - if (async && vma_is_valid(vma, write_fault)) - *async = true; - pfn = KVM_PFN_ERR_FAULT; + if ((kfp->flags & FOLL_NOWAIT) && + vma_is_valid(vma, kfp->flags & FOLL_WRITE)) + pfn = KVM_PFN_ERR_NEEDS_IO; + else + pfn = KVM_PFN_ERR_FAULT; } -exit: - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); return pfn; } -kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool *async, bool write_fault, - bool *writable) +static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) { - unsigned long addr =
__gfn_to_hva_many(slot, gfn, NULL, write_fault); + kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL, + kfp->flags & FOLL_WRITE); - if (addr == KVM_HVA_ERR_RO_BAD) { - if (writable) - *writable = false; + if (kfp->hva == KVM_HVA_ERR_RO_BAD) return KVM_PFN_ERR_RO_FAULT; - } - if (kvm_is_error_hva(addr)) { - if (writable) - *writable = false; + if (kvm_is_error_hva(kfp->hva)) return KVM_PFN_NOSLOT; - } - /* Do not map writable pfn in the readonly memslot. */ - if (writable && memslot_is_readonly(slot)) { - *writable = false; - writable = NULL; + if (memslot_is_readonly(kfp->slot) && kfp->map_writable) { + *kfp->map_writable = false; + kfp->map_writable = NULL; } - return hva_to_pfn(addr, atomic, async, write_fault, - writable); -} -EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); - -kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable) -{ - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, - write_fault, writable); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); - -kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) -{ - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); + return hva_to_pfn(kfp); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); -kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) +kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, + unsigned int foll, bool *writable, + struct page **refcounted_page) { - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); - -kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) -{ - return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); + struct kvm_follow_pfn kfp = { + .slot = slot, + .gfn = gfn, + .flags = foll, + .map_writable = writable, + .refcounted_page = refcounted_page, + }; -kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - return 
gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); -} -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); + if (WARN_ON_ONCE(!writable || !refcounted_page)) + return KVM_PFN_ERR_FAULT; -kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) -{ - return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn); + *writable = false; + *refcounted_page = NULL; -kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); + return kvm_follow_pfn(&kfp); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_faultin_pfn); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages) +int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages) { unsigned long addr; gfn_t entry = 0; @@ -1711,97 +3089,93 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, if (entry < nr_pages) return 0; - return __get_user_pages_fast(addr, nr_pages, 1, pages); -} -EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); - -static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) -{ - if (is_error_noslot_pfn(pfn)) - return KVM_ERR_PTR_BAD_PAGE; - - if (kvm_is_reserved_pfn(pfn)) { - WARN_ON(1); - return KVM_ERR_PTR_BAD_PAGE; - } - - return pfn_to_page(pfn); + return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prefetch_pages); -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - kvm_pfn_t pfn; - - pfn = gfn_to_pfn(kvm, gfn); +/* + * Don't use this API unless you are absolutely, positively certain that KVM + * needs to get a struct page, e.g. to pin the page for firmware DMA. + * + * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate + * its refcount. 
+ */ +struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) +{ + struct page *refcounted_page = NULL; + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(kvm, gfn), + .gfn = gfn, + .flags = write ? FOLL_WRITE : 0, + .refcounted_page = &refcounted_page, + }; - return kvm_pfn_to_page(pfn); + (void)kvm_follow_pfn(&kfp); + return refcounted_page; } -EXPORT_SYMBOL_GPL(gfn_to_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__gfn_to_page); -struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + bool writable) { - kvm_pfn_t pfn; - - pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(vcpu->kvm, gfn), + .gfn = gfn, + .flags = writable ? FOLL_WRITE : 0, + .refcounted_page = &map->pinned_page, + .pin = true, + }; - return kvm_pfn_to_page(pfn); -} -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); + map->pinned_page = NULL; + map->page = NULL; + map->hva = NULL; + map->gfn = gfn; + map->writable = writable; -void kvm_release_page_clean(struct page *page) -{ - WARN_ON(is_error_page(page)); + map->pfn = kvm_follow_pfn(&kfp); + if (is_error_noslot_pfn(map->pfn)) + return -EINVAL; - kvm_release_pfn_clean(page_to_pfn(page)); -} -EXPORT_SYMBOL_GPL(kvm_release_page_clean); + if (pfn_valid(map->pfn)) { + map->page = pfn_to_page(map->pfn); + map->hva = kmap(map->page); +#ifdef CONFIG_HAS_IOMEM + } else { + map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB); +#endif + } -void kvm_release_pfn_clean(kvm_pfn_t pfn) -{ - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) - put_page(pfn_to_page(pfn)); + return map->hva ? 
0 : -EFAULT; } -EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_map); -void kvm_release_page_dirty(struct page *page) +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { - WARN_ON(is_error_page(page)); - - kvm_release_pfn_dirty(page_to_pfn(page)); -} -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); + if (!map->hva) + return; -void kvm_release_pfn_dirty(kvm_pfn_t pfn) -{ - kvm_set_pfn_dirty(pfn); - kvm_release_pfn_clean(pfn); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); + if (map->page) + kunmap(map->page); +#ifdef CONFIG_HAS_IOMEM + else + memunmap(map->hva); +#endif -void kvm_set_pfn_dirty(kvm_pfn_t pfn) -{ - if (!kvm_is_reserved_pfn(pfn)) { - struct page *page = pfn_to_page(pfn); + if (map->writable) + kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - if (!PageReserved(page)) - SetPageDirty(page); + if (map->pinned_page) { + if (map->writable) + kvm_set_page_dirty(map->pinned_page); + kvm_set_page_accessed(map->pinned_page); + unpin_user_page(map->pinned_page); } -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); -void kvm_set_pfn_accessed(kvm_pfn_t pfn) -{ - if (!kvm_is_reserved_pfn(pfn)) - mark_page_accessed(pfn_to_page(pfn)); + map->hva = NULL; + map->page = NULL; + map->pinned_page = NULL; } -EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); - -void kvm_get_pfn(kvm_pfn_t pfn) -{ - if (!kvm_is_reserved_pfn(pfn)) - get_page(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_get_pfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_unmap); static int next_segment(unsigned long len, int offset) { @@ -1811,12 +3185,16 @@ static int next_segment(unsigned long len, int offset) return len; } +/* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, int len) { int r; unsigned long addr; + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) + return -EFAULT; + addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) 
return -EFAULT; @@ -1833,7 +3211,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_page); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len) @@ -1842,7 +3220,7 @@ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_page); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { @@ -1862,7 +3240,7 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) } return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { @@ -1882,7 +3260,7 @@ int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned l } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest); static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, unsigned long len) @@ -1890,6 +3268,9 @@ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, int r; unsigned long addr; + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) + return -EFAULT; + addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; @@ -1901,17 +3282,6 @@ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, return 0; } -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - int offset = offset_in_page(gpa); - - return __kvm_read_guest_atomic(slot, gfn, data, 
offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); - int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { @@ -1921,21 +3291,26 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, return __kvm_read_guest_atomic(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_atomic); -static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, +/* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */ +static int __kvm_write_guest_page(struct kvm *kvm, + struct kvm_memory_slot *memslot, gfn_t gfn, const void *data, int offset, int len) { int r; unsigned long addr; + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) + return -EFAULT; + addr = gfn_to_hva_memslot(memslot, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; r = __copy_to_user((void __user *)addr + offset, data, len); if (r) return -EFAULT; - mark_page_dirty_in_slot(memslot, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); return 0; } @@ -1944,18 +3319,18 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - return __kvm_write_guest_page(slot, gfn, data, offset, len); + return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_page); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __kvm_write_guest_page(slot, gfn, data, offset, len); + return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest_page); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) @@ -1976,7 +3351,7 @@ int 
kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, } return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len) @@ -1997,7 +3372,7 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest); static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, struct gfn_to_hva_cache *ghc, @@ -2008,33 +3383,36 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; gfn_t nr_pages_needed = end_gfn - start_gfn + 1; gfn_t nr_pages_avail; - int r = start_gfn <= end_gfn ? 0 : -EINVAL; - ghc->gpa = gpa; + /* Update ghc->generation before performing any error checks. */ ghc->generation = slots->generation; - ghc->len = len; - ghc->hva = KVM_HVA_ERR_BAD; + + if (start_gfn > end_gfn) { + ghc->hva = KVM_HVA_ERR_BAD; + return -EINVAL; + } /* * If the requested region crosses two memslots, we still * verify that the entire region is valid here. */ - while (!r && start_gfn <= end_gfn) { + for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { ghc->memslot = __gfn_to_memslot(slots, start_gfn); ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); if (kvm_is_error_hva(ghc->hva)) - r = -EFAULT; - start_gfn += nr_pages_avail; + return -EFAULT; } /* Use the slow path for cross page reads and writes. 
*/ - if (!r && nr_pages_needed == 1) + if (nr_pages_needed == 1) ghc->hva += offset; else ghc->memslot = NULL; - return r; + ghc->gpa = gpa; + ghc->len = len; + return 0; } int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, @@ -2043,7 +3421,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, @@ -2053,75 +3431,83 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int r; gpa_t gpa = ghc->gpa + offset; - BUG_ON(len + offset > ghc->len); - - if (slots->generation != ghc->generation) - __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); + if (WARN_ON_ONCE(len + offset > ghc->len)) + return -EINVAL; - if (unlikely(!ghc->memslot)) - return kvm_write_guest(kvm, gpa, data, len); + if (slots->generation != ghc->generation) { + if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) + return -EFAULT; + } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; + if (unlikely(!ghc->memslot)) + return kvm_write_guest(kvm, gpa, data, len); + r = __copy_to_user((void __user *)ghc->hva + offset, data, len); if (r) return -EFAULT; - mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); + mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT); return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_offset_cached); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_cached); -int 
kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned int offset, + unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; + gpa_t gpa = ghc->gpa + offset; - BUG_ON(len > ghc->len); - - if (slots->generation != ghc->generation) - __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); + if (WARN_ON_ONCE(len + offset > ghc->len)) + return -EINVAL; - if (unlikely(!ghc->memslot)) - return kvm_read_guest(kvm, ghc->gpa, data, len); + if (slots->generation != ghc->generation) { + if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) + return -EFAULT; + } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; - r = __copy_from_user(data, (void __user *)ghc->hva, len); + if (unlikely(!ghc->memslot)) + return kvm_read_guest(kvm, gpa, data, len); + + r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); if (r) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_offset_cached); -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); - - return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); + return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_clear_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_cached); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) { + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_clear_guest_page(kvm, gfn, offset, seg); + ret = kvm_write_guest_page(kvm, gfn, 
zero_page, offset, seg); if (ret < 0) return ret; offset = 0; @@ -2130,35 +3516,50 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } return 0; } -EXPORT_SYMBOL_GPL(kvm_clear_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_clear_guest); -static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, - gfn_t gfn) +void mark_page_dirty_in_slot(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + gfn_t gfn) { - if (memslot && memslot->dirty_bitmap) { + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + +#ifdef CONFIG_HAVE_KVM_DIRTY_RING + if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm)) + return; + + WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm)); +#endif + + if (memslot && kvm_slot_dirty_track_enabled(memslot)) { unsigned long rel_gfn = gfn - memslot->base_gfn; + u32 slot = (memslot->as_id << 16) | memslot->id; - set_bit_le(rel_gfn, memslot->dirty_bitmap); + if (kvm->dirty_ring_size && vcpu) + kvm_dirty_ring_push(vcpu, slot, rel_gfn); + else if (memslot->dirty_bitmap) + set_bit_le(rel_gfn, memslot->dirty_bitmap); } } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty_in_slot); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot; memslot = gfn_to_memslot(kvm, gfn); - mark_page_dirty_in_slot(memslot, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); } -EXPORT_SYMBOL_GPL(mark_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memory_slot *memslot; memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - mark_page_dirty_in_slot(memslot, gfn); + mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); } -EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_mark_page_dirty); void kvm_sigset_activate(struct kvm_vcpu *vcpu) { @@ -2185,34 +3586,38 @@ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { - unsigned int old, val, grow; + unsigned 
int old, val, grow, grow_start; old = val = vcpu->halt_poll_ns; + grow_start = READ_ONCE(halt_poll_ns_grow_start); grow = READ_ONCE(halt_poll_ns_grow); - /* 10us base */ - if (val == 0 && grow) - val = 10000; - else - val *= grow; + if (!grow) + goto out; - if (val > halt_poll_ns) - val = halt_poll_ns; + val *= grow; + if (val < grow_start) + val = grow_start; vcpu->halt_poll_ns = val; +out: trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); } static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) { - unsigned int old, val, shrink; + unsigned int old, val, shrink, grow_start; old = val = vcpu->halt_poll_ns; shrink = READ_ONCE(halt_poll_ns_shrink); + grow_start = READ_ONCE(halt_poll_ns_grow_start); if (shrink == 0) val = 0; else val /= shrink; + if (val < grow_start) + val = 0; + vcpu->halt_poll_ns = val; trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); } @@ -2222,14 +3627,14 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) int ret = -EINTR; int idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvm_arch_vcpu_runnable(vcpu)) { - kvm_make_request(KVM_REQ_UNHALT, vcpu); + if (kvm_arch_vcpu_runnable(vcpu)) goto out; - } if (kvm_cpu_has_pending_timer(vcpu)) goto out; if (signal_pending(current)) goto out; + if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu)) + goto out; ret = 0; out: @@ -2238,39 +3643,24 @@ out: } /* - * The vCPU has executed a HLT instruction with in-kernel mode enabled. + * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is + * pending. This is mostly used when halting a vCPU, but may also be used + * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI. 
*/ -void kvm_vcpu_block(struct kvm_vcpu *vcpu) +bool kvm_vcpu_block(struct kvm_vcpu *vcpu) { - ktime_t start, cur; - DECLARE_SWAITQUEUE(wait); + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); bool waited = false; - u64 block_ns; - start = cur = ktime_get(); - if (vcpu->halt_poll_ns) { - ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); - - ++vcpu->stat.halt_attempted_poll; - do { - /* - * This sets KVM_REQ_UNHALT if an interrupt - * arrives. - */ - if (kvm_vcpu_check_block(vcpu) < 0) { - ++vcpu->stat.halt_successful_poll; - if (!vcpu_valid_wakeup(vcpu)) - ++vcpu->stat.halt_poll_invalid; - goto out; - } - cur = ktime_get(); - } while (single_task_running() && ktime_before(cur, stop)); - } + vcpu->stat.generic.blocking = 1; + preempt_disable(); kvm_arch_vcpu_blocking(vcpu); + prepare_to_rcuwait(wait); + preempt_enable(); for (;;) { - prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); if (kvm_vcpu_check_block(vcpu) < 0) break; @@ -2279,88 +3669,221 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) schedule(); } - finish_swait(&vcpu->wq, &wait); - cur = ktime_get(); - + preempt_disable(); + finish_rcuwait(wait); kvm_arch_vcpu_unblocking(vcpu); + preempt_enable(); + + vcpu->stat.generic.blocking = 0; + + return waited; +} + +static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start, + ktime_t end, bool success) +{ + struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic; + u64 poll_ns = ktime_to_ns(ktime_sub(end, start)); + + ++vcpu->stat.generic.halt_attempted_poll; + + if (success) { + ++vcpu->stat.generic.halt_successful_poll; + + if (!vcpu_valid_wakeup(vcpu)) + ++vcpu->stat.generic.halt_poll_invalid; + + stats->halt_poll_success_ns += poll_ns; + KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns); + } else { + stats->halt_poll_fail_ns += poll_ns; + KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns); + } +} + +static unsigned int 
kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + if (kvm->override_halt_poll_ns) { + /* + * Ensure kvm->max_halt_poll_ns is not read before + * kvm->override_halt_poll_ns. + * + * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL. + */ + smp_rmb(); + return READ_ONCE(kvm->max_halt_poll_ns); + } + + return READ_ONCE(halt_poll_ns); +} + +/* + * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt + * polling is enabled, busy wait for a short time before blocking to avoid the + * expensive block+unblock sequence if a wake event arrives soon after the vCPU + * is halted. + */ +void kvm_vcpu_halt(struct kvm_vcpu *vcpu) +{ + unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu); + bool halt_poll_allowed = !kvm_arch_no_poll(vcpu); + ktime_t start, cur, poll_end; + bool waited = false; + bool do_halt_poll; + u64 halt_ns; + + if (vcpu->halt_poll_ns > max_halt_poll_ns) + vcpu->halt_poll_ns = max_halt_poll_ns; + + do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns; + + start = cur = poll_end = ktime_get(); + if (do_halt_poll) { + ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns); + + do { + if (kvm_vcpu_check_block(vcpu) < 0) + goto out; + cpu_relax(); + poll_end = cur = ktime_get(); + } while (kvm_vcpu_can_poll(cur, stop)); + } + + waited = kvm_vcpu_block(vcpu); + + cur = ktime_get(); + if (waited) { + vcpu->stat.generic.halt_wait_ns += + ktime_to_ns(cur) - ktime_to_ns(poll_end); + KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(poll_end)); + } out: - block_ns = ktime_to_ns(cur) - ktime_to_ns(start); - - if (!vcpu_valid_wakeup(vcpu)) - shrink_halt_poll_ns(vcpu); - else if (halt_poll_ns) { - if (block_ns <= vcpu->halt_poll_ns) - ; - /* we had a long block, shrink polling */ - else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + /* The total time the vCPU was "halted", including polling time. 
*/ + halt_ns = ktime_to_ns(cur) - ktime_to_ns(start); + + /* + * Note, halt-polling is considered successful so long as the vCPU was + * never actually scheduled out, i.e. even if the wake event arrived + * after the halt-polling loop itself, but before the full wait. + */ + if (do_halt_poll) + update_halt_poll_stats(vcpu, start, poll_end, !waited); + + if (halt_poll_allowed) { + /* Recompute the max halt poll time in case it changed. */ + max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu); + + if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); - /* we had a short halt and our poll time is too small */ - else if (vcpu->halt_poll_ns < halt_poll_ns && - block_ns < halt_poll_ns) - grow_halt_poll_ns(vcpu); - } else - vcpu->halt_poll_ns = 0; + } else if (max_halt_poll_ns) { + if (halt_ns <= vcpu->halt_poll_ns) + ; + /* we had a long block, shrink polling */ + else if (vcpu->halt_poll_ns && + halt_ns > max_halt_poll_ns) + shrink_halt_poll_ns(vcpu); + /* we had a short halt and our poll time is too small */ + else if (vcpu->halt_poll_ns < max_halt_poll_ns && + halt_ns < max_halt_poll_ns) + grow_halt_poll_ns(vcpu); + } else { + vcpu->halt_poll_ns = 0; + } + } - trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); - kvm_arch_vcpu_block_finish(vcpu); + trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_vcpu_block); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_halt); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { - struct swait_queue_head *wqp; - - wqp = kvm_arch_vcpu_wq(vcpu); - if (swq_has_sleeper(wqp)) { - swake_up_one(wqp); - ++vcpu->stat.halt_wakeup; + if (__kvm_vcpu_wake_up(vcpu)) { + WRITE_ONCE(vcpu->ready, true); + ++vcpu->stat.generic.halt_wakeup; return true; } return false; } -EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_wake_up); #ifndef CONFIG_S390 /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 
*/ -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) { - int me; - int cpu = vcpu->cpu; + int me, cpu; if (kvm_vcpu_wake_up(vcpu)) return; me = get_cpu(); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (kvm_arch_vcpu_should_kick(vcpu)) - smp_send_reschedule(cpu); + /* + * The only state change done outside the vcpu mutex is IN_GUEST_MODE + * to EXITING_GUEST_MODE. Therefore the moderately expensive "should + * kick" check does not need atomic operations if kvm_vcpu_kick is used + * within the vCPU thread itself. + */ + if (vcpu == __this_cpu_read(kvm_running_vcpu)) { + if (vcpu->mode == IN_GUEST_MODE) + WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE); + goto out; + } + + /* + * Note, the vCPU could get migrated to a different pCPU at any point + * after kvm_arch_vcpu_should_kick(), which could result in sending an + * IPI to the previous pCPU. But, that's ok because the purpose of the + * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the + * vCPU also requires it to leave IN_GUEST_MODE. + */ + if (kvm_arch_vcpu_should_kick(vcpu)) { + cpu = READ_ONCE(vcpu->cpu); + if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) { + /* + * Use a reschedule IPI to kick the vCPU if the caller + * doesn't need to wait for a response, as KVM allows + * kicking vCPUs while IRQs are disabled, but using the + * SMP function call framework with IRQs disabled can + * deadlock due to taking cross-CPU locks. 
+ */ + if (wait) + smp_call_function_single(cpu, ack_kick, NULL, wait); + else + smp_send_reschedule(cpu); + } + } +out: put_cpu(); } -EXPORT_SYMBOL_GPL(kvm_vcpu_kick); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) { - struct pid *pid; struct task_struct *task = NULL; - int ret = 0; + int ret; + + if (!read_trylock(&target->pid_lock)) + return 0; + + if (target->pid) + task = get_pid_task(target->pid, PIDTYPE_PID); + + read_unlock(&target->pid_lock); - rcu_read_lock(); - pid = rcu_dereference(target->pid); - if (pid) - task = get_pid_task(pid, PIDTYPE_PID); - rcu_read_unlock(); if (!task) - return ret; + return 0; ret = yield_to(task, 1); put_task_struct(task); return ret; } -EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_yield_to); /* * Helper that checks whether a VCPU is eligible for directed yield. @@ -2368,7 +3891,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); * * (a) VCPU which has not done pl-exit or cpu relax intercepted recently * (preempted lock holder), indicated by @in_spin_loop. - * Set at the beiginning and cleared at the end of interception/PLE handler. + * Set at the beginning and cleared at the end of interception/PLE handler. * * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get * chance last time (mostly it has become eligible now since we have probably @@ -2401,51 +3924,113 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #endif } +/* + * Unlike kvm_arch_vcpu_runnable, this function is called outside + * a vcpu_load/vcpu_put pair. However, for most architectures + * kvm_arch_vcpu_runnable does not require vcpu_load. 
+ */ +bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_runnable(vcpu); +} + +static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (kvm_arch_dy_runnable(vcpu)) + return true; + +#ifdef CONFIG_KVM_ASYNC_PF + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; +#endif + + return false; +} + +/* + * By default, simply query the target vCPU's current mode when checking if a + * vCPU was preempted in kernel mode. All architectures except x86 (or more + * specifically, except VMX) allow querying whether or not a vCPU is in kernel + * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel() + * directly for cross-vCPU checks is functionally correct and accurate. + */ +bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_in_kernel(vcpu); +} + +bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { + int nr_vcpus, start, i, idx, yielded; struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; - int last_boosted_vcpu = me->kvm->last_boosted_vcpu; - int yielded = 0; int try = 3; - int pass; - int i; + + nr_vcpus = atomic_read(&kvm->online_vcpus); + if (nr_vcpus < 2) + return; + + /* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */ + smp_rmb(); kvm_vcpu_set_in_spin_loop(me, true); + /* - * We boost the priority of a VCPU that is runnable but not - * currently running, because it got preempted by something - * else and called schedule in __vcpu_run. Hopefully that - * VCPU is holding the lock that we need and will release it. - * We approximate round-robin by starting at the last boosted VCPU. + * The current vCPU ("me") is spinning in kernel mode, i.e. is likely + * waiting for a resource to become available. Attempt to yield to a + * vCPU that is runnable, but not currently running, e.g. because the + * vCPU was preempted by a higher priority task. 
With luck, the vCPU + * that was preempted is holding a lock or some other resource that the + * current vCPU is waiting to acquire, and yielding to the other vCPU + * will allow it to make forward progress and release the lock (or kick + * the spinning vCPU, etc). + * + * Since KVM has no insight into what exactly the guest is doing, + * approximate a round-robin selection by iterating over all vCPUs, + * starting at the last boosted vCPU. I.e. if N=kvm->last_boosted_vcpu, + * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed. + * + * Note, this is inherently racy, e.g. if multiple vCPUs are spinning, + * they may all try to yield to the same vCPU(s). But as above, this + * is all best effort due to KVM's lack of visibility into the guest. */ - for (pass = 0; pass < 2 && !yielded && try; pass++) { - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!pass && i <= last_boosted_vcpu) { - i = last_boosted_vcpu; - continue; - } else if (pass && i > last_boosted_vcpu) - break; - if (!READ_ONCE(vcpu->preempted)) - continue; - if (vcpu == me) - continue; - if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) - continue; - if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) - continue; - if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) - continue; + start = READ_ONCE(kvm->last_boosted_vcpu) + 1; + for (i = 0; i < nr_vcpus; i++) { + idx = (start + i) % nr_vcpus; + if (idx == me->vcpu_idx) + continue; - yielded = kvm_vcpu_yield_to(vcpu); - if (yielded > 0) { - kvm->last_boosted_vcpu = i; - break; - } else if (yielded < 0) { - try--; - if (!try) - break; - } + vcpu = xa_load(&kvm->vcpu_array, idx); + if (!READ_ONCE(vcpu->ready)) + continue; + if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu)) + continue; + + /* + * Treat the target vCPU as being in-kernel if it has a pending + * interrupt, as the vCPU trying to yield may be spinning + * waiting on IPI delivery, i.e. the target vCPU is in-kernel + * for the purposes of directed yield. 
+ */ + if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && + !kvm_arch_dy_has_pending_interrupt(vcpu) && + !kvm_arch_vcpu_preempted_in_kernel(vcpu)) + continue; + + if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) + continue; + + yielded = kvm_vcpu_yield_to(vcpu); + if (yielded > 0) { + WRITE_ONCE(kvm->last_boosted_vcpu, idx); + break; + } else if (yielded < 0 && !--try) { + break; } } kvm_vcpu_set_in_spin_loop(me, false); @@ -2453,7 +4038,18 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) /* Ensure vcpu is not eligible during next spinloop */ kvm_vcpu_set_dy_eligible(me, false); } -EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_on_spin); + +static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) +{ +#ifdef CONFIG_HAVE_KVM_DIRTY_RING + return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) && + (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET + + kvm->dirty_ring_size / PAGE_SIZE); +#else + return false; +#endif +} static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) { @@ -2470,6 +4066,10 @@ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif + else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff)) + page = kvm_dirty_ring_get_page( + &vcpu->dirty_ring, + vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET); else return kvm_arch_vcpu_fault(vcpu, vmf); get_page(page); @@ -2483,6 +4083,14 @@ static const struct vm_operations_struct kvm_vcpu_vm_ops = { static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) { + struct kvm_vcpu *vcpu = file->private_data; + unsigned long pages = vma_pages(vma); + + if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) || + kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) && + ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED))) + return -EINVAL; + vma->vm_ops = &kvm_vcpu_vm_ops; return 0; } @@ -2491,7 +4099,6 @@ static int 
kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; - debugfs_remove_recursive(vcpu->debugfs_dentry); kvm_put_kvm(vcpu->kvm); return 0; } @@ -2515,102 +4122,153 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } -static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS +static int vcpu_get_pid(void *data, u64 *val) { - char dir_name[ITOA_MAX_LEN * 2]; - int ret; + struct kvm_vcpu *vcpu = data; - if (!kvm_arch_has_vcpu_debugfs()) - return 0; + read_lock(&vcpu->pid_lock); + *val = pid_nr(vcpu->pid); + read_unlock(&vcpu->pid_lock); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n"); + +static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + struct dentry *debugfs_dentry; + char dir_name[ITOA_MAX_LEN * 2]; if (!debugfs_initialized()) - return 0; + return; snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); - vcpu->debugfs_dentry = debugfs_create_dir(dir_name, - vcpu->kvm->debugfs_dentry); - if (!vcpu->debugfs_dentry) - return -ENOMEM; + debugfs_dentry = debugfs_create_dir(dir_name, + vcpu->kvm->debugfs_dentry); + debugfs_create_file("pid", 0444, debugfs_dentry, vcpu, + &vcpu_get_pid_fops); - ret = kvm_arch_create_vcpu_debugfs(vcpu); - if (ret < 0) { - debugfs_remove_recursive(vcpu->debugfs_dentry); - return ret; - } - - return 0; + kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); } +#endif /* * Creates some virtual cpus. Good luck creating more than one. */ -static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) { int r; struct kvm_vcpu *vcpu; + struct page *page; - if (id >= KVM_MAX_VCPU_ID) + /* + * KVM tracks vCPU IDs as 'int', be kind to userspace and reject + * too-large values instead of silently truncating. 
+ * + * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first + * changing the storage type (at the very least, IDs should be tracked + * as unsigned ints). + */ + BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX); + if (id >= KVM_MAX_VCPU_IDS) return -EINVAL; mutex_lock(&kvm->lock); - if (kvm->created_vcpus == KVM_MAX_VCPUS) { + if (kvm->created_vcpus >= kvm->max_vcpus) { mutex_unlock(&kvm->lock); return -EINVAL; } + r = kvm_arch_vcpu_precreate(kvm, id); + if (r) { + mutex_unlock(&kvm->lock); + return r; + } + kvm->created_vcpus++; mutex_unlock(&kvm->lock); - vcpu = kvm_arch_vcpu_create(kvm, id); - if (IS_ERR(vcpu)) { - r = PTR_ERR(vcpu); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); + if (!vcpu) { + r = -ENOMEM; goto vcpu_decrement; } - preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto vcpu_free; + } + vcpu->run = page_address(page); - r = kvm_arch_vcpu_setup(vcpu); - if (r) - goto vcpu_destroy; + kvm_vcpu_init(vcpu, kvm, id); - r = kvm_create_vcpu_debugfs(vcpu); + r = kvm_arch_vcpu_create(vcpu); if (r) - goto vcpu_destroy; + goto vcpu_free_run_page; + + if (kvm->dirty_ring_size) { + r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring, + id, kvm->dirty_ring_size); + if (r) + goto arch_vcpu_destroy; + } mutex_lock(&kvm->lock); + if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; } - BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); + vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); + r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT); + WARN_ON_ONCE(r == -EBUSY); + if (r) + goto unlock_vcpu_destroy; - /* Now it's all set up, let userspace reach it */ + /* + * Now it's all set up, let userspace reach it. Grab the vCPU's mutex + * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully + * visible (per online_vcpus), e.g. 
so that KVM doesn't get tricked + * into a NULL-pointer dereference because KVM thinks the _current_ + * vCPU doesn't exist. As a bonus, taking vcpu->mutex ensures lockdep + * knows it's taken *inside* kvm->lock. + */ + mutex_lock(&vcpu->mutex); kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); - if (r < 0) { - kvm_put_kvm(kvm); - goto unlock_vcpu_destroy; - } - - kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; + if (r < 0) + goto kvm_put_xa_erase; /* - * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus - * before kvm->online_vcpu's incremented value. + * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu + * pointer before kvm->online_vcpu's incremented value. */ smp_wmb(); atomic_inc(&kvm->online_vcpus); + mutex_unlock(&vcpu->mutex); mutex_unlock(&kvm->lock); kvm_arch_vcpu_postcreate(vcpu); + kvm_create_vcpu_debugfs(vcpu); return r; +kvm_put_xa_erase: + mutex_unlock(&vcpu->mutex); + kvm_put_kvm_no_destroy(kvm); + xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx); unlock_vcpu_destroy: mutex_unlock(&kvm->lock); - debugfs_remove_recursive(vcpu->debugfs_dentry); -vcpu_destroy: + kvm_dirty_ring_free(&vcpu->dirty_ring); +arch_vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); +vcpu_free_run_page: + free_page((unsigned long)vcpu->run); +vcpu_free: + kmem_cache_free(kvm_vcpu_cache, vcpu); vcpu_decrement: mutex_lock(&kvm->lock); kvm->created_vcpus--; @@ -2629,6 +4287,129 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) return 0; } +static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer, + size_t size, loff_t *offset) +{ + struct kvm_vcpu *vcpu = file->private_data; + + return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header, + &kvm_vcpu_stats_desc[0], &vcpu->stat, + sizeof(vcpu->stat), user_buffer, size, offset); +} + +static int kvm_vcpu_stats_release(struct inode *inode, struct file *file) +{ + struct kvm_vcpu *vcpu = file->private_data; + + kvm_put_kvm(vcpu->kvm); + return 0; +} + +static const struct file_operations 
kvm_vcpu_stats_fops = { + .owner = THIS_MODULE, + .read = kvm_vcpu_stats_read, + .release = kvm_vcpu_stats_release, + .llseek = noop_llseek, +}; + +static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) +{ + int fd; + struct file *file; + char name[15 + ITOA_MAX_LEN + 1]; + + snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id); + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu, + O_RDONLY, FMODE_PREAD); + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + + kvm_get_kvm(vcpu->kvm); + fd_install(fd, file); + + return fd; +} + +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + int idx; + long r; + u64 full_size; + + if (range->flags) + return -EINVAL; + + if (!PAGE_ALIGNED(range->gpa) || + !PAGE_ALIGNED(range->size) || + range->gpa + range->size <= range->gpa) + return -EINVAL; + + vcpu_load(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + full_size = range->size; + do { + if (signal_pending(current)) { + r = -EINTR; + break; + } + + r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); + if (WARN_ON_ONCE(r == 0 || r == -EIO)) + break; + + if (r < 0) + break; + + range->size -= r; + range->gpa += r; + cond_resched(); + } while (range->size); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + vcpu_put(vcpu); + + /* Return success if at least one page was mapped successfully. */ + return full_size == range->size ? r : 0; +} +#endif + +static int kvm_wait_for_vcpu_online(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + /* + * In practice, this happy path will always be taken, as a well-behaved + * VMM will never invoke a vCPU ioctl() before KVM_CREATE_VCPU returns. 
+ */ + if (likely(vcpu->vcpu_idx < atomic_read(&kvm->online_vcpus))) + return 0; + + /* + * Acquire and release the vCPU's mutex to wait for vCPU creation to + * complete (kvm_vm_ioctl_create_vcpu() holds the mutex until the vCPU + * is fully online). + */ + if (mutex_lock_killable(&vcpu->mutex)) + return -EINTR; + + mutex_unlock(&vcpu->mutex); + + if (WARN_ON_ONCE(!kvm_get_vcpu(kvm, vcpu->vcpu_idx))) + return -EIO; + + return 0; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2638,17 +4419,26 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_fpu *fpu = NULL; struct kvm_sregs *kvm_sregs = NULL; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) return -EIO; if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) return -EINVAL; /* - * Some architectures have vcpu ioctls that are asynchronous to vcpu - * execution; mutex_lock() would break them. + * Wait for the vCPU to be online before handling the ioctl(), as KVM + * assumes the vCPU is reachable via vcpu_array, i.e. may dereference + * a NULL pointer if userspace invokes an ioctl() before KVM is ready. */ - r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); + r = kvm_wait_for_vcpu_online(vcpu); + if (r) + return r; + + /* + * Let arch code handle select vCPU ioctls without holding vcpu->mutex, + * e.g. to support ioctls that can run asynchronous to vCPU execution. + */ + r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg); if (r != -ENOIOCTLCMD) return r; @@ -2660,7 +4450,14 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EINVAL; if (arg) goto out; - oldpid = rcu_access_pointer(vcpu->pid); + + /* + * Note, vcpu->pid is primarily protected by vcpu->mutex. The + * dedicated r/w lock allows other tasks, e.g. other vCPUs, to + * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield + * directly to this vCPU + */ + oldpid = vcpu->pid; if (unlikely(oldpid != task_pid(current))) { /* The thread running this VCPU changed. 
*/ struct pid *newpid; @@ -2670,12 +4467,22 @@ static long kvm_vcpu_ioctl(struct file *filp, break; newpid = get_task_pid(current, PIDTYPE_PID); - rcu_assign_pointer(vcpu->pid, newpid); - if (oldpid) - synchronize_rcu(); + write_lock(&vcpu->pid_lock); + vcpu->pid = newpid; + write_unlock(&vcpu->pid_lock); + put_pid(oldpid); } - r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe); + r = kvm_arch_vcpu_ioctl_run(vcpu); + vcpu->wants_to_run = false; + + /* + * FIXME: Remove this hack once all KVM architectures + * support the generic TIF bits, i.e. a dedicated TIF_RSEQ. + */ + rseq_virt_userspace_exit(); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } @@ -2700,7 +4507,6 @@ out_free1: case KVM_SET_REGS: { struct kvm_regs *kvm_regs; - r = -ENOMEM; kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); if (IS_ERR(kvm_regs)) { r = PTR_ERR(kvm_regs); @@ -2826,6 +4632,24 @@ out_free1: r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; } + case KVM_GET_STATS_FD: { + r = kvm_vcpu_ioctl_get_stats_fd(vcpu); + break; + } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY + case KVM_PRE_FAULT_MEMORY: { + struct kvm_pre_fault_memory range; + + r = -EFAULT; + if (copy_from_user(&range, argp, sizeof(range))) + break; + r = kvm_vcpu_pre_fault_memory(vcpu, &range); + /* Pass back leftover range. 
*/ + if (copy_to_user(argp, &range, sizeof(range))) + r = -EFAULT; + break; + } +#endif default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } @@ -2844,7 +4668,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, void __user *argp = compat_ptr(arg); int r; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) return -EIO; switch (ioctl) { @@ -2862,7 +4686,8 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, if (kvm_sigmask.len != sizeof(compat_sigset_t)) goto out; r = -EFAULT; - if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) + if (get_compat_sigset(&sigset, + (compat_sigset_t __user *)sigmask_arg->sigset)) goto out; r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); } else @@ -2878,6 +4703,16 @@ out: } #endif +static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct kvm_device *dev = filp->private_data; + + if (dev->ops->mmap) + return dev->ops->mmap(dev, vma); + + return -ENODEV; +} + static int kvm_device_ioctl_attr(struct kvm_device *dev, int (*accessor)(struct kvm_device *dev, struct kvm_device_attr *attr), @@ -2899,6 +4734,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, { struct kvm_device *dev = filp->private_data; + if (dev->kvm->mm != current->mm || dev->kvm->vm_dead) + return -EIO; + switch (ioctl) { case KVM_SET_DEVICE_ATTR: return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); @@ -2919,14 +4757,23 @@ static int kvm_device_release(struct inode *inode, struct file *filp) struct kvm_device *dev = filp->private_data; struct kvm *kvm = dev->kvm; + if (dev->ops->release) { + mutex_lock(&kvm->lock); + list_del_rcu(&dev->vm_node); + synchronize_rcu(); + dev->ops->release(dev); + mutex_unlock(&kvm->lock); + } + kvm_put_kvm(kvm); return 0; } -static const struct file_operations kvm_device_fops = { +static struct file_operations kvm_device_fops = { .unlocked_ioctl = kvm_device_ioctl, .release = kvm_device_release, KVM_COMPAT(kvm_device_ioctl), + 
.mmap = kvm_device_mmap, }; struct kvm_device *kvm_device_from_filp(struct file *filp) @@ -2937,14 +4784,14 @@ struct kvm_device *kvm_device_from_filp(struct file *filp) return filp->private_data; } -static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { +static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_MPIC [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, #endif }; -int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) +int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) { if (type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENOSPC; @@ -2965,22 +4812,24 @@ void kvm_unregister_device_ops(u32 type) static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { - struct kvm_device_ops *ops = NULL; + const struct kvm_device_ops *ops; struct kvm_device *dev; bool test = cd->flags & KVM_CREATE_DEVICE_TEST; + int type; int ret; if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENODEV; - ops = kvm_device_ops_table[cd->type]; + type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); + ops = kvm_device_ops_table[type]; if (ops == NULL) return -ENODEV; if (test) return 0; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; @@ -2988,52 +4837,55 @@ static int kvm_ioctl_create_device(struct kvm *kvm, dev->kvm = kvm; mutex_lock(&kvm->lock); - ret = ops->create(dev, cd->type); + ret = ops->create(dev, type); if (ret < 0) { mutex_unlock(&kvm->lock); kfree(dev); return ret; } - list_add(&dev->vm_node, &kvm->devices); + list_add_rcu(&dev->vm_node, &kvm->devices); mutex_unlock(&kvm->lock); if (ops->init) ops->init(dev); + kvm_get_kvm(kvm); ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); if (ret < 0) { + kvm_put_kvm_no_destroy(kvm); mutex_lock(&kvm->lock); - list_del(&dev->vm_node); + 
list_del_rcu(&dev->vm_node); + synchronize_rcu(); + if (ops->release) + ops->release(dev); mutex_unlock(&kvm->lock); - ops->destroy(dev); + if (ops->destroy) + ops->destroy(dev); return ret; } - kvm_get_kvm(kvm); cd->fd = ret; return 0; } -static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) +static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) { switch (arg) { case KVM_CAP_USER_MEMORY: + case KVM_CAP_USER_MEMORY2: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: case KVM_CAP_INTERNAL_ERROR_DATA: #ifdef CONFIG_HAVE_KVM_MSI case KVM_CAP_SIGNAL_MSI: #endif -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQFD: - case KVM_CAP_IRQFD_RESAMPLE: #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: -#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT - case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: -#endif + case KVM_CAP_HALT_POLL: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -3041,44 +4893,264 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_COALESCED_PIO: return 1; #endif +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: + return KVM_DIRTY_LOG_MANUAL_CAPS; +#endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; #endif -#if KVM_ADDRESS_SPACE_NUM > 1 +#if KVM_MAX_NR_ADDRESS_SPACES > 1 case KVM_CAP_MULTI_ADDRESS_SPACE: - return KVM_ADDRESS_SPACE_NUM; + if (kvm) + return kvm_arch_nr_memslot_as_ids(kvm); + return KVM_MAX_NR_ADDRESS_SPACES; +#endif + case KVM_CAP_NR_MEMSLOTS: + return KVM_USER_MEM_SLOTS; + case KVM_CAP_DIRTY_LOG_RING: +#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO + return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); +#else + return 0; +#endif + case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: +#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL + return KVM_DIRTY_RING_MAX_ENTRIES * 
sizeof(struct kvm_dirty_gfn); +#else + return 0; +#endif +#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP + case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: +#endif + case KVM_CAP_BINARY_STATS_FD: + case KVM_CAP_SYSTEM_EVENT_DATA: + case KVM_CAP_DEVICE_CTRL: + return 1; +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + case KVM_CAP_MEMORY_ATTRIBUTES: + return kvm_supported_mem_attributes(kvm); +#endif +#ifdef CONFIG_KVM_GUEST_MEMFD + case KVM_CAP_GUEST_MEMFD: + return 1; + case KVM_CAP_GUEST_MEMFD_FLAGS: + return kvm_gmem_get_supported_flags(kvm); #endif - case KVM_CAP_MAX_VCPU_ID: - return KVM_MAX_VCPU_ID; default: break; } return kvm_vm_ioctl_check_extension(kvm, arg); } +static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) +{ + int r; + + if (!KVM_DIRTY_LOG_PAGE_OFFSET) + return -EINVAL; + + /* The size must be a power of 2 */ + if (!size || (size & (size - 1))) + return -EINVAL; + + /* Should be bigger to keep the reserved entries, or a page */ + if (size < kvm_dirty_ring_get_rsvd_entries(kvm) * + sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) + return -EINVAL; + + if (size > KVM_DIRTY_RING_MAX_ENTRIES * + sizeof(struct kvm_dirty_gfn)) + return -E2BIG; + + /* We only allow it to be set once */ + if (kvm->dirty_ring_size) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (kvm->created_vcpus) { + /* We don't allow changing this value after a vCPU has been created */ + r = -EINVAL; + } else { + kvm->dirty_ring_size = size; + r = 0; + } + + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + int cleared = 0, r; + + if (!kvm->dirty_ring_size) + return -EINVAL; + + mutex_lock(&kvm->slots_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + r = kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring, &cleared); + if (r) + break; + } + + mutex_unlock(&kvm->slots_lock); + + if (cleared) + kvm_flush_remote_tlbs(kvm); + + return cleared; +} + int __attribute__((weak))
kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { return -EINVAL; } +bool kvm_are_all_memslots_empty(struct kvm *kvm) +{ + int i; + + lockdep_assert_held(&kvm->slots_lock); + + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { + if (!kvm_memslots_empty(__kvm_memslots(kvm, i))) + return false; + } + + return true; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_are_all_memslots_empty); + static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) { switch (cap->cap) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT - case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: - if (cap->flags || (cap->args[0] & ~1)) + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { + u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; + + if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) + allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; + + if (cap->flags || (cap->args[0] & ~allowed_options)) return -EINVAL; kvm->manual_dirty_log_protect = cap->args[0]; return 0; + } #endif + case KVM_CAP_HALT_POLL: { + if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) + return -EINVAL; + + kvm->max_halt_poll_ns = cap->args[0]; + + /* + * Ensure kvm->override_halt_poll_ns does not become visible + * before kvm->max_halt_poll_ns. + * + * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns(). + */ + smp_wmb(); + kvm->override_halt_poll_ns = true; + + return 0; + } + case KVM_CAP_DIRTY_LOG_RING: + case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: + if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap)) + return -EINVAL; + + return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); + case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: { + int r = -EINVAL; + + if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) || + !kvm->dirty_ring_size || cap->flags) + return r; + + mutex_lock(&kvm->slots_lock); + + /* + * For simplicity, allow enabling ring+bitmap if and only if + * there are no memslots, e.g. to ensure all memslots allocate + * a bitmap after the capability is enabled. 
+ */ + if (kvm_are_all_memslots_empty(kvm)) { + kvm->dirty_ring_with_bitmap = true; + r = 0; + } + + mutex_unlock(&kvm->slots_lock); + + return r; + } default: return kvm_vm_ioctl_enable_cap(kvm, cap); } } +static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer, + size_t size, loff_t *offset) +{ + struct kvm *kvm = file->private_data; + + return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header, + &kvm_vm_stats_desc[0], &kvm->stat, + sizeof(kvm->stat), user_buffer, size, offset); +} + +static int kvm_vm_stats_release(struct inode *inode, struct file *file) +{ + struct kvm *kvm = file->private_data; + + kvm_put_kvm(kvm); + return 0; +} + +static const struct file_operations kvm_vm_stats_fops = { + .owner = THIS_MODULE, + .read = kvm_vm_stats_read, + .release = kvm_vm_stats_release, + .llseek = noop_llseek, +}; + +static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) +{ + int fd; + struct file *file; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + file = anon_inode_getfile_fmode("kvm-vm-stats", + &kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD); + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + + kvm_get_kvm(kvm); + fd_install(fd, file); + + return fd; +} + +#define SANITY_CHECK_MEM_REGION_FIELD(field) \ +do { \ + BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \ + offsetof(struct kvm_userspace_memory_region2, field)); \ + BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \ + sizeof_field(struct kvm_userspace_memory_region2, field)); \ +} while (0) + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3086,7 +5158,7 @@ static long kvm_vm_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; switch (ioctl) { case KVM_CREATE_VCPU: @@ -3101,15 +5173,39 @@ static long kvm_vm_ioctl(struct file *filp, r = 
kvm_vm_ioctl_enable_cap_generic(kvm, &cap); break; } + case KVM_SET_USER_MEMORY_REGION2: case KVM_SET_USER_MEMORY_REGION: { - struct kvm_userspace_memory_region kvm_userspace_mem; + struct kvm_userspace_memory_region2 mem; + unsigned long size; + + if (ioctl == KVM_SET_USER_MEMORY_REGION) { + /* + * Fields beyond struct kvm_userspace_memory_region shouldn't be + * accessed, but avoid leaking kernel memory in case of a bug. + */ + memset(&mem, 0, sizeof(mem)); + size = sizeof(struct kvm_userspace_memory_region); + } else { + size = sizeof(struct kvm_userspace_memory_region2); + } + + /* Ensure the common parts of the two structs are identical. */ + SANITY_CHECK_MEM_REGION_FIELD(slot); + SANITY_CHECK_MEM_REGION_FIELD(flags); + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); + SANITY_CHECK_MEM_REGION_FIELD(memory_size); + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); r = -EFAULT; - if (copy_from_user(&kvm_userspace_mem, argp, - sizeof(kvm_userspace_mem))) + if (copy_from_user(&mem, argp, size)) + goto out; + + r = -EINVAL; + if (ioctl == KVM_SET_USER_MEMORY_REGION && + (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS)) goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); + r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; } case KVM_GET_DIRTY_LOG: { @@ -3222,24 +5318,32 @@ static long kvm_vm_ioctl(struct file *filp, if (routing.flags) goto out; if (routing.nr) { - r = -ENOMEM; - entries = vmalloc(array_size(sizeof(*entries), - routing.nr)); - if (!entries) - goto out; - r = -EFAULT; urouting = argp; - if (copy_from_user(entries, urouting->entries, - routing.nr * sizeof(*entries))) - goto out_free_irq_routing; + entries = vmemdup_array_user(urouting->entries, + routing.nr, sizeof(*entries)); + if (IS_ERR(entries)) { + r = PTR_ERR(entries); + goto out; + } } r = kvm_set_irq_routing(kvm, entries, routing.nr, routing.flags); -out_free_irq_routing: - vfree(entries); + kvfree(entries); break; } #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ +#ifdef 
CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + case KVM_SET_MEMORY_ATTRIBUTES: { + struct kvm_memory_attributes attrs; + + r = -EFAULT; + if (copy_from_user(&attrs, argp, sizeof(attrs))) + goto out; + + r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs); + break; + } +#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ case KVM_CREATE_DEVICE: { struct kvm_create_device cd; @@ -3261,6 +5365,24 @@ out_free_irq_routing: case KVM_CHECK_EXTENSION: r = kvm_vm_ioctl_check_extension_generic(kvm, arg); break; + case KVM_RESET_DIRTY_RINGS: + r = kvm_vm_ioctl_reset_dirty_pages(kvm); + break; + case KVM_GET_STATS_FD: + r = kvm_vm_ioctl_get_stats_fd(kvm); + break; +#ifdef CONFIG_KVM_GUEST_MEMFD + case KVM_CREATE_GUEST_MEMFD: { + struct kvm_create_guest_memfd guest_memfd; + + r = -EFAULT; + if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd))) + goto out; + + r = kvm_gmem_create(kvm, &guest_memfd); + break; + } +#endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } @@ -3278,15 +5400,54 @@ struct compat_kvm_dirty_log { }; }; +struct compat_kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + compat_uptr_t dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + +long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOTTY; +} + static long kvm_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; + + r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg); + if (r != -ENOTTY) + return r; + switch (ioctl) { +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CLEAR_DIRTY_LOG: { + struct compat_kvm_clear_dirty_log compat_log; + struct kvm_clear_dirty_log log; + + if (copy_from_user(&compat_log, (void __user *)arg, + sizeof(compat_log))) + return -EFAULT; + log.slot = compat_log.slot; + log.num_pages = compat_log.num_pages; + 
log.first_page = compat_log.first_page; + log.padding2 = compat_log.padding2; + log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); + + r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); + break; + } +#endif case KVM_GET_DIRTY_LOG: { struct compat_kvm_dirty_log compat_log; struct kvm_dirty_log log; @@ -3316,27 +5477,33 @@ static struct file_operations kvm_vm_fops = { KVM_COMPAT(kvm_vm_compat_ioctl), }; +bool file_is_kvm(struct file *file) +{ + return file && file->f_op == &kvm_vm_fops; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm); + static int kvm_dev_ioctl_create_vm(unsigned long type) { - int r; + char fdname[ITOA_MAX_LEN + 1]; + int r, fd; struct kvm *kvm; struct file *file; - kvm = kvm_create_vm(type); - if (IS_ERR(kvm)) - return PTR_ERR(kvm); -#ifdef CONFIG_KVM_MMIO - r = kvm_coalesced_mmio_init(kvm); - if (r < 0) - goto put_kvm; -#endif - r = get_unused_fd_flags(O_CLOEXEC); - if (r < 0) - goto put_kvm; + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + snprintf(fdname, sizeof(fdname), "%d", fd); + + kvm = kvm_create_vm(type, fdname); + if (IS_ERR(kvm)) { + r = PTR_ERR(kvm); + goto put_fd; + } file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (IS_ERR(file)) { - put_unused_fd(r); r = PTR_ERR(file); goto put_kvm; } @@ -3347,25 +5514,22 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) * cases it will be called by the final fput(file) and will take * care of doing kvm_put_kvm(kvm). 
*/ - if (kvm_create_vm_debugfs(kvm, r) < 0) { - put_unused_fd(r); - fput(file); - return -ENOMEM; - } kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); - fd_install(r, file); - return r; + fd_install(fd, file); + return fd; put_kvm: kvm_put_kvm(kvm); +put_fd: + put_unused_fd(fd); return r; } static long kvm_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { - long r = -EINVAL; + int r = -EINVAL; switch (ioctl) { case KVM_GET_API_VERSION: @@ -3390,11 +5554,6 @@ static long kvm_dev_ioctl(struct file *filp, r += PAGE_SIZE; /* coalesced mmio ring page */ #endif break; - case KVM_TRACE_ENABLE: - case KVM_TRACE_PAUSE: - case KVM_TRACE_DISABLE: - r = -EOPNOTSUPP; - break; default: return kvm_arch_dev_ioctl(filp, ioctl, arg); } @@ -3414,110 +5573,212 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; -static void hardware_enable_nolock(void *junk) -{ - int cpu = raw_smp_processor_id(); - int r; +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +bool enable_virt_at_load = true; +module_param(enable_virt_at_load, bool, 0444); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load); - if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) - return; +__visible bool kvm_rebooting; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); + +static DEFINE_PER_CPU(bool, virtualization_enabled); +static DEFINE_MUTEX(kvm_usage_lock); +static int kvm_usage_count; - cpumask_set_cpu(cpu, cpus_hardware_enabled); +__weak void kvm_arch_enable_virtualization(void) +{ - r = kvm_arch_hardware_enable(); +} + +__weak void kvm_arch_disable_virtualization(void) +{ - if (r) { - cpumask_clear_cpu(cpu, cpus_hardware_enabled); - atomic_inc(&hardware_enable_failed); - pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); - } } -static int kvm_starting_cpu(unsigned int cpu) +static int kvm_enable_virtualization_cpu(void) { - raw_spin_lock(&kvm_count_lock); - if (kvm_usage_count) - hardware_enable_nolock(NULL); - raw_spin_unlock(&kvm_count_lock); + if 
(__this_cpu_read(virtualization_enabled)) + return 0; + + if (kvm_arch_enable_virtualization_cpu()) { + pr_info("kvm: enabling virtualization on CPU%d failed\n", + raw_smp_processor_id()); + return -EIO; + } + + __this_cpu_write(virtualization_enabled, true); return 0; } -static void hardware_disable_nolock(void *junk) +static int kvm_online_cpu(unsigned int cpu) { - int cpu = raw_smp_processor_id(); + /* + * Abort the CPU online process if hardware virtualization cannot + * be enabled. Otherwise running VMs would encounter unrecoverable + * errors when scheduled to this CPU. + */ + return kvm_enable_virtualization_cpu(); +} - if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) +static void kvm_disable_virtualization_cpu(void *ign) +{ + if (!__this_cpu_read(virtualization_enabled)) return; - cpumask_clear_cpu(cpu, cpus_hardware_enabled); - kvm_arch_hardware_disable(); + + kvm_arch_disable_virtualization_cpu(); + + __this_cpu_write(virtualization_enabled, false); } -static int kvm_dying_cpu(unsigned int cpu) +static int kvm_offline_cpu(unsigned int cpu) { - raw_spin_lock(&kvm_count_lock); - if (kvm_usage_count) - hardware_disable_nolock(NULL); - raw_spin_unlock(&kvm_count_lock); + kvm_disable_virtualization_cpu(NULL); return 0; } -static void hardware_disable_all_nolock(void) +static void kvm_shutdown(void *data) { - BUG_ON(!kvm_usage_count); + /* + * Disable hardware virtualization and set kvm_rebooting to indicate + * that KVM has asynchronously disabled hardware virtualization, i.e. + * that relevant errors and exceptions aren't entirely unexpected. + * Some flavors of hardware virtualization need to be disabled before + * transferring control to firmware (to perform shutdown/reboot), e.g. + * on x86, virtualization can block INIT interrupts, which are used by + * firmware to pull APs back under firmware control. Note, this path + * is used for both shutdown and reboot scenarios, i.e. neither name is + * 100% comprehensive. 
+ */ + pr_info("kvm: exiting hardware virtualization\n"); + kvm_rebooting = true; + on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); +} + +static int kvm_suspend(void *data) +{ + /* + * Secondary CPUs and CPU hotplug are disabled across the suspend/resume + * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage + * count is stable. Assert that kvm_usage_lock is not held to ensure + * the system isn't suspended while KVM is enabling hardware. Hardware + * enabling can be preempted, but the task cannot be frozen until it has + * dropped all locks (userspace tasks are frozen via a fake signal). + */ + lockdep_assert_not_held(&kvm_usage_lock); + lockdep_assert_irqs_disabled(); - kvm_usage_count--; - if (!kvm_usage_count) - on_each_cpu(hardware_disable_nolock, NULL, 1); + kvm_disable_virtualization_cpu(NULL); + return 0; } -static void hardware_disable_all(void) +static void kvm_resume(void *data) { - raw_spin_lock(&kvm_count_lock); - hardware_disable_all_nolock(); - raw_spin_unlock(&kvm_count_lock); + lockdep_assert_not_held(&kvm_usage_lock); + lockdep_assert_irqs_disabled(); + + WARN_ON_ONCE(kvm_enable_virtualization_cpu()); } -static int hardware_enable_all(void) +static const struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, + .shutdown = kvm_shutdown, +}; + +static struct syscore kvm_syscore = { + .ops = &kvm_syscore_ops, +}; + +int kvm_enable_virtualization(void) { - int r = 0; + int r; - raw_spin_lock(&kvm_count_lock); + guard(mutex)(&kvm_usage_lock); - kvm_usage_count++; - if (kvm_usage_count == 1) { - atomic_set(&hardware_enable_failed, 0); - on_each_cpu(hardware_enable_nolock, NULL, 1); + if (kvm_usage_count++) + return 0; - if (atomic_read(&hardware_enable_failed)) { - hardware_disable_all_nolock(); - r = -EBUSY; - } + kvm_arch_enable_virtualization(); + + r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", + kvm_online_cpu, kvm_offline_cpu); + if (r) + goto err_cpuhp; + + 
register_syscore(&kvm_syscore); + + /* + * Undo virtualization enabling and bail if the system is going down. + * If userspace initiated a forced reboot, e.g. reboot -f, then it's + * possible for an in-flight operation to enable virtualization after + * syscore_shutdown() is called, i.e. without kvm_shutdown() being + * invoked. Note, this relies on system_state being set _before_ + * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked + * or this CPU observes the impending shutdown. Which is why KVM uses + * a syscore ops hook instead of registering a dedicated reboot + * notifier (the latter runs before system_state is updated). + */ + if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || + system_state == SYSTEM_RESTART) { + r = -EBUSY; + goto err_rebooting; } - raw_spin_unlock(&kvm_count_lock); + return 0; +err_rebooting: + unregister_syscore(&kvm_syscore); + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); +err_cpuhp: + kvm_arch_disable_virtualization(); + --kvm_usage_count; return r; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization); -static int kvm_reboot(struct notifier_block *notifier, unsigned long val, - void *v) +void kvm_disable_virtualization(void) { - /* - * Some (well, at least mine) BIOSes hang on reboot if - * in vmx root mode. - * - * And Intel TXT required VMX off for all cpu when system shutdown. 
- */ - pr_info("kvm: exiting hardware virtualization\n"); - kvm_rebooting = true; - on_each_cpu(hardware_disable_nolock, NULL, 1); - return NOTIFY_OK; + guard(mutex)(&kvm_usage_lock); + + if (--kvm_usage_count) + return; + + unregister_syscore(&kvm_syscore); + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); + kvm_arch_disable_virtualization(); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization); -static struct notifier_block kvm_reboot_notifier = { - .notifier_call = kvm_reboot, - .priority = 0, -}; +static int kvm_init_virtualization(void) +{ + if (enable_virt_at_load) + return kvm_enable_virtualization(); + + return 0; +} + +static void kvm_uninit_virtualization(void) +{ + if (enable_virt_at_load) + kvm_disable_virtualization(); +} +#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ +static int kvm_init_virtualization(void) +{ + return 0; +} + +static void kvm_uninit_virtualization(void) +{ + +} +#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ + +static void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + if (dev->ops->destructor) + dev->ops->destructor(dev); +} static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { @@ -3605,7 +5866,18 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -/* kvm_io_bus_write - called under kvm->slots_lock */ +static struct kvm_io_bus *kvm_get_bus_srcu(struct kvm *kvm, enum kvm_bus idx) +{ + /* + * Ensure that any updates to kvm_buses[] observed by the previous vCPU + * machine instruction are also visible to the vCPU machine instruction + * that triggered this call. 
+ */ + smp_mb__after_srcu_read_lock(); + + return srcu_dereference(kvm->buses[idx], &kvm->srcu); +} + int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { @@ -3618,14 +5890,14 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_write); -/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) { @@ -3637,7 +5909,7 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; @@ -3674,9 +5946,7 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -EXPORT_SYMBOL_GPL(kvm_io_bus_write); -/* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { @@ -3689,15 +5959,21 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_read); + +static void __free_bus(struct rcu_head *rcu) +{ + struct kvm_io_bus *bus = container_of(rcu, struct kvm_io_bus, rcu); + kfree(bus); +} -/* Caller must hold slots_lock. 
*/ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) { @@ -3705,6 +5981,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, struct kvm_io_bus *new_bus, *bus; struct kvm_io_range range; + lockdep_assert_held(&kvm->slots_lock); + bus = kvm_get_bus(kvm, bus_idx); if (!bus) return -ENOMEM; @@ -3713,8 +5991,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; - new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * - sizeof(struct kvm_io_range)), GFP_KERNEL); + new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), + GFP_KERNEL_ACCOUNT); if (!new_bus) return -ENOMEM; @@ -3734,48 +6012,57 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, memcpy(new_bus->range + i + 1, bus->range + i, (bus->dev_count - i) * sizeof(struct kvm_io_range)); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); - synchronize_srcu_expedited(&kvm->srcu); - kfree(bus); + call_srcu(&kvm->srcu, &bus->rcu, __free_bus); return 0; } -/* Caller must hold slots_lock. 
*/ -void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev) +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { int i; struct kvm_io_bus *new_bus, *bus; + lockdep_assert_held(&kvm->slots_lock); + bus = kvm_get_bus(kvm, bus_idx); if (!bus) - return; + return 0; - for (i = 0; i < bus->dev_count; i++) + for (i = 0; i < bus->dev_count; i++) { if (bus->range[i].dev == dev) { break; } + } if (i == bus->dev_count) - return; + return 0; - new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * - sizeof(struct kvm_io_range)), GFP_KERNEL); - if (!new_bus) { - pr_err("kvm: failed to shrink bus, removing it completely\n"); - goto broken; + new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), + GFP_KERNEL_ACCOUNT); + if (new_bus) { + memcpy(new_bus, bus, struct_size(bus, range, i)); + new_bus->dev_count--; + memcpy(new_bus->range + i, bus->range + i + 1, + flex_array_size(new_bus, range, new_bus->dev_count - i)); } - memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); - new_bus->dev_count--; - memcpy(new_bus->range + i, bus->range + i + 1, - (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); - -broken: rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); + + /* + * If NULL bus is installed, destroy the old bus, including all the + * attached devices. Otherwise, destroy the caller's device only. 
+ */ + if (!new_bus) { + pr_err("kvm: failed to shrink bus, removing it completely\n"); + kvm_io_bus_destroy(bus); + return -ENOMEM; + } + + kvm_iodevice_destructor(dev); kfree(bus); - return; + return 0; } struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, @@ -3787,7 +6074,7 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, srcu_idx = srcu_read_lock(&kvm->srcu); - bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + bus = kvm_get_bus_srcu(kvm, bus_idx); if (!bus) goto out_unlock; @@ -3802,35 +6089,35 @@ out_unlock: return iodev; } -EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_get_dev); static int kvm_debugfs_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { - struct kvm_stat_data *stat_data = (struct kvm_stat_data *) - inode->i_private; + int ret; + struct kvm_stat_data *stat_data = inode->i_private; - /* The debugfs files are a reference to the kvm struct which - * is still valid when kvm_destroy_vm is called. - * To avoid the race between open and the removal of the debugfs - * directory we test against the users count. + /* + * The debugfs files are a reference to the kvm struct which + * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe + * avoids the race between open and the removal of the debugfs directory. */ - if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) + if (!kvm_get_kvm_safe(stat_data->kvm)) return -ENOENT; - if (simple_attr_open(inode, file, get, set, fmt)) { + ret = simple_attr_open(inode, file, get, + kvm_stats_debugfs_mode(stat_data->desc) & 0222 + ? 
set : NULL, fmt); + if (ret) kvm_put_kvm(stat_data->kvm); - return -ENOMEM; - } - return 0; + return ret; } static int kvm_debugfs_release(struct inode *inode, struct file *file) { - struct kvm_stat_data *stat_data = (struct kvm_stat_data *) - inode->i_private; + struct kvm_stat_data *stat_data = inode->i_private; simple_attr_release(inode, file); kvm_put_kvm(stat_data->kvm); @@ -3838,108 +6125,113 @@ static int kvm_debugfs_release(struct inode *inode, struct file *file) return 0; } -static int vm_stat_get_per_vm(void *data, u64 *val) +static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) { - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; + *val = *(u64 *)((void *)(&kvm->stat) + offset); - *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); + return 0; +} + +static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) +{ + *(u64 *)((void *)(&kvm->stat) + offset) = 0; return 0; } -static int vm_stat_clear_per_vm(void *data, u64 val) +static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) { - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; + unsigned long i; + struct kvm_vcpu *vcpu; - if (val) - return -EINVAL; + *val = 0; - *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0; + kvm_for_each_vcpu(i, vcpu, kvm) + *val += *(u64 *)((void *)(&vcpu->stat) + offset); return 0; } -static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) +static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) { - __simple_attr_check_format("%llu\n", 0ull); - return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, - vm_stat_clear_per_vm, "%llu\n"); -} + unsigned long i; + struct kvm_vcpu *vcpu; -static const struct file_operations vm_stat_get_per_vm_fops = { - .owner = THIS_MODULE, - .open = vm_stat_get_per_vm_open, - .release = kvm_debugfs_release, - .read = simple_attr_read, - .write = simple_attr_write, - .llseek = no_llseek, -}; + kvm_for_each_vcpu(i, vcpu, kvm) + 
*(u64 *)((void *)(&vcpu->stat) + offset) = 0; -static int vcpu_stat_get_per_vm(void *data, u64 *val) -{ - int i; - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; - struct kvm_vcpu *vcpu; + return 0; +} - *val = 0; +static int kvm_stat_data_get(void *data, u64 *val) +{ + int r = -EFAULT; + struct kvm_stat_data *stat_data = data; - kvm_for_each_vcpu(i, vcpu, stat_data->kvm) - *val += *(u64 *)((void *)vcpu + stat_data->offset); + switch (stat_data->kind) { + case KVM_STAT_VM: + r = kvm_get_stat_per_vm(stat_data->kvm, + stat_data->desc->desc.offset, val); + break; + case KVM_STAT_VCPU: + r = kvm_get_stat_per_vcpu(stat_data->kvm, + stat_data->desc->desc.offset, val); + break; + } - return 0; + return r; } -static int vcpu_stat_clear_per_vm(void *data, u64 val) +static int kvm_stat_data_clear(void *data, u64 val) { - int i; - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; - struct kvm_vcpu *vcpu; + int r = -EFAULT; + struct kvm_stat_data *stat_data = data; if (val) return -EINVAL; - kvm_for_each_vcpu(i, vcpu, stat_data->kvm) - *(u64 *)((void *)vcpu + stat_data->offset) = 0; + switch (stat_data->kind) { + case KVM_STAT_VM: + r = kvm_clear_stat_per_vm(stat_data->kvm, + stat_data->desc->desc.offset); + break; + case KVM_STAT_VCPU: + r = kvm_clear_stat_per_vcpu(stat_data->kvm, + stat_data->desc->desc.offset); + break; + } - return 0; + return r; } -static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) +static int kvm_stat_data_open(struct inode *inode, struct file *file) { __simple_attr_check_format("%llu\n", 0ull); - return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, - vcpu_stat_clear_per_vm, "%llu\n"); + return kvm_debugfs_open(inode, file, kvm_stat_data_get, + kvm_stat_data_clear, "%llu\n"); } -static const struct file_operations vcpu_stat_get_per_vm_fops = { - .owner = THIS_MODULE, - .open = vcpu_stat_get_per_vm_open, +static const struct file_operations stat_fops_per_vm = { + .owner = THIS_MODULE, + .open 
= kvm_stat_data_open, .release = kvm_debugfs_release, - .read = simple_attr_read, - .write = simple_attr_write, - .llseek = no_llseek, -}; - -static const struct file_operations *stat_fops_per_vm[] = { - [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, - [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, + .read = simple_attr_read, + .write = simple_attr_write, }; static int vm_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; u64 tmp_val; *val = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + kvm_get_stat_per_vm(kvm, offset, &tmp_val); *val += tmp_val; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3947,38 +6239,35 @@ static int vm_stat_clear(void *_offset, u64 val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; if (val) return -EINVAL; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vm_stat_clear_per_vm((void *)&stat_tmp, 0); + kvm_clear_stat_per_vm(kvm, offset); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n"); static int vcpu_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; u64 tmp_val; *val = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + kvm_get_stat_per_vcpu(kvm, offset, &tmp_val); *val += tmp_val; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3986,28 +6275,22 @@ static int vcpu_stat_clear(void 
*_offset, u64 val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; if (val) return -EINVAL; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); + kvm_clear_stat_per_vcpu(kvm, offset); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, "%llu\n"); - -static const struct file_operations *stat_fops[] = { - [KVM_STAT_VCPU] = &vcpu_stat_fops, - [KVM_STAT_VM] = &vm_stat_fops, -}; +DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n"); static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) { @@ -4017,7 +6300,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) if (!kvm_dev.this_device || !kvm) return; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); if (type == KVM_EVENT_CREATE_VM) { kvm_createvm_count++; kvm_active_vms++; @@ -4026,7 +6309,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) } created = kvm_createvm_count; active = kvm_active_vms; - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) @@ -4043,7 +6326,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) } add_uevent_var(env, "PID=%d", kvm->userspace_pid); - if (kvm->debugfs_dentry) { + if (!IS_ERR(kvm->debugfs_dentry)) { char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); if (p) { @@ -4061,38 +6344,35 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) static void kvm_init_debug(void) { - struct kvm_stats_debugfs_item *p; + const struct file_operations *fops; + const struct _kvm_stats_desc *pdesc; + int i; kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); - kvm_debugfs_num_entries = 0; - for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - 
debugfs_create_file(p->name, 0644, kvm_debugfs_dir, - (void *)(long)p->offset, - stat_fops[p->kind]); + for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { + pdesc = &kvm_vm_stats_desc[i]; + if (kvm_stats_debugfs_mode(pdesc) & 0222) + fops = &vm_stat_fops; + else + fops = &vm_stat_readonly_fops; + debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), + kvm_debugfs_dir, + (void *)(long)pdesc->desc.offset, fops); } -} - -static int kvm_suspend(void) -{ - if (kvm_usage_count) - hardware_disable_nolock(NULL); - return 0; -} -static void kvm_resume(void) -{ - if (kvm_usage_count) { - WARN_ON(raw_spin_is_locked(&kvm_count_lock)); - hardware_enable_nolock(NULL); + for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { + pdesc = &kvm_vcpu_stats_desc[i]; + if (kvm_stats_debugfs_mode(pdesc) & 0222) + fops = &vcpu_stat_fops; + else + fops = &vcpu_stat_readonly_fops; + debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), + kvm_debugfs_dir, + (void *)(long)pdesc->desc.offset, fops); } } -static struct syscore_ops kvm_syscore_ops = { - .suspend = kvm_suspend, - .resume = kvm_resume, -}; - static inline struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) { @@ -4103,12 +6383,13 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); - if (vcpu->preempted) - vcpu->preempted = false; - - kvm_arch_sched_in(vcpu, cpu); + WRITE_ONCE(vcpu->preempted, false); + WRITE_ONCE(vcpu->ready, false); + __this_cpu_write(kvm_running_vcpu, vcpu); kvm_arch_vcpu_load(vcpu, cpu); + + WRITE_ONCE(vcpu->scheduled_out, false); } static void kvm_sched_out(struct preempt_notifier *pn, @@ -4116,54 +6397,93 @@ static void kvm_sched_out(struct preempt_notifier *pn, { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); - if (current->state == TASK_RUNNING) - vcpu->preempted = true; + WRITE_ONCE(vcpu->scheduled_out, true); + + if (task_is_runnable(current) && vcpu->wants_to_run) { + 
WRITE_ONCE(vcpu->preempted, true); + WRITE_ONCE(vcpu->ready, true); + } kvm_arch_vcpu_put(vcpu); + __this_cpu_write(kvm_running_vcpu, NULL); } -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module) +/** + * kvm_get_running_vcpu - get the vcpu running on the current CPU. + * + * We can disable preemption locally around accessing the per-CPU variable, + * and use the resolved vcpu pointer after enabling preemption again, + * because even if the current thread is migrated to another CPU, reading + * the per-CPU value later will give us the same value as we update the + * per-CPU variable in the preempt notifier handlers. + */ +struct kvm_vcpu *kvm_get_running_vcpu(void) { - int r; - int cpu; + struct kvm_vcpu *vcpu; - r = kvm_arch_init(opaque); - if (r) - goto out_fail; + preempt_disable(); + vcpu = __this_cpu_read(kvm_running_vcpu); + preempt_enable(); - /* - * kvm_arch_init makes sure there's at most one caller - * for architectures that support multiple implementations, - * like intel and amd on x86. - * kvm_arch_init must be called before kvm_irqfd_init to avoid creating - * conflicts in case kvm is already setup for another implementation. - */ - r = kvm_irqfd_init(); - if (r) - goto out_irqfd; + return vcpu; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_running_vcpu); - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { - r = -ENOMEM; - goto out_free_0; - } +/** + * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. 
+ */ +struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) +{ + return &kvm_running_vcpu; +} - r = kvm_arch_hardware_setup(); - if (r < 0) - goto out_free_0a; +#ifdef CONFIG_GUEST_PERF_EVENTS +static unsigned int kvm_guest_state(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + unsigned int state; - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, - kvm_arch_check_processor_compat, - &r, 1); - if (r < 0) - goto out_free_1; - } + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", - kvm_starting_cpu, kvm_dying_cpu); - if (r) - goto out_free_2; - register_reboot_notifier(&kvm_reboot_notifier); + state = PERF_GUEST_ACTIVE; + if (!kvm_arch_vcpu_in_kernel(vcpu)) + state |= PERF_GUEST_USER; + + return state; +} + +static unsigned long kvm_guest_get_ip(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ + if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu))) + return 0; + + return kvm_arch_vcpu_get_ip(vcpu); +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .state = kvm_guest_state, + .get_ip = kvm_guest_get_ip, + .handle_intel_pt_intr = NULL, +}; + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) +{ + kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; + perf_register_guest_info_callbacks(&kvm_guest_cbs); +} +void kvm_unregister_perf_callbacks(void) +{ + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); +} +#endif + +int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) +{ + int r; + int cpu; /* A kmem cache lets us meet the alignment requirements of fx_save. 
*/ if (!vcpu_align) @@ -4172,28 +6492,32 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, SLAB_ACCOUNT, offsetof(struct kvm_vcpu, arch), - sizeof_field(struct kvm_vcpu, arch), + offsetofend(struct kvm_vcpu, stats_id) + - offsetof(struct kvm_vcpu, arch), NULL); - if (!kvm_vcpu_cache) { - r = -ENOMEM; - goto out_free_3; + if (!kvm_vcpu_cache) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu))) { + r = -ENOMEM; + goto err_cpu_kick_mask; + } } + r = kvm_irqfd_init(); + if (r) + goto err_irqfd; + r = kvm_async_pf_init(); if (r) - goto out_free; + goto err_async_pf; kvm_chardev_ops.owner = module; kvm_vm_fops.owner = module; kvm_vcpu_fops.owner = module; - - r = misc_register(&kvm_dev); - if (r) { - pr_err("kvm: misc device register failed\n"); - goto out_unreg; - } - - register_syscore_ops(&kvm_syscore_ops); + kvm_device_fops.owner = module; kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; @@ -4201,45 +6525,68 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, kvm_init_debug(); r = kvm_vfio_ops_init(); - WARN_ON(r); + if (WARN_ON_ONCE(r)) + goto err_vfio; + + r = kvm_gmem_init(module); + if (r) + goto err_gmem; + + r = kvm_init_virtualization(); + if (r) + goto err_virt; + + /* + * Registration _must_ be the very last thing done, as this exposes + * /dev/kvm to userspace, i.e. all infrastructure must be setup! 
+ */ + r = misc_register(&kvm_dev); + if (r) { + pr_err("kvm: misc device register failed\n"); + goto err_register; + } return 0; -out_unreg: +err_register: + kvm_uninit_virtualization(); +err_virt: + kvm_gmem_exit(); +err_gmem: + kvm_vfio_ops_exit(); +err_vfio: kvm_async_pf_deinit(); -out_free: - kmem_cache_destroy(kvm_vcpu_cache); -out_free_3: - unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); -out_free_2: -out_free_1: - kvm_arch_hardware_unsetup(); -out_free_0a: - free_cpumask_var(cpus_hardware_enabled); -out_free_0: +err_async_pf: kvm_irqfd_exit(); -out_irqfd: - kvm_arch_exit(); -out_fail: +err_irqfd: +err_cpu_kick_mask: + for_each_possible_cpu(cpu) + free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); + kmem_cache_destroy(kvm_vcpu_cache); return r; } -EXPORT_SYMBOL_GPL(kvm_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init); void kvm_exit(void) { - debugfs_remove_recursive(kvm_debugfs_dir); + int cpu; + + /* + * Note, unregistering /dev/kvm doesn't strictly need to come first, + * fops_get(), a.k.a. try_module_get(), prevents acquiring references + * to KVM while the module is being stopped. 
+ */ + misc_deregister(&kvm_dev); + + kvm_uninit_virtualization(); + + debugfs_remove_recursive(kvm_debugfs_dir); + for_each_possible_cpu(cpu) + free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); + kvm_gmem_exit(); + kvm_vfio_ops_exit(); kvm_async_pf_deinit(); - unregister_syscore_ops(&kvm_syscore_ops); - unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); - on_each_cpu(hardware_disable_nolock, NULL, 1); - kvm_arch_hardware_unsetup(); - kvm_arch_exit(); kvm_irqfd_exit(); - free_cpumask_var(cpus_hardware_enabled); - kvm_vfio_ops_exit(); } -EXPORT_SYMBOL_GPL(kvm_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_exit); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h new file mode 100644 index 000000000000..9fcc5d5b7f8d --- /dev/null +++ b/virt/kvm/kvm_mm.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __KVM_MM_H__ +#define __KVM_MM_H__ 1 + +/* + * Architectures can choose whether to use an rwlock or spinlock + * for the mmu_lock. These macros, for use in common code + * only, avoid using #ifdefs in places that must deal with + * multiple architectures. + */ + +#ifdef KVM_HAVE_MMU_RWLOCK +#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) write_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) write_unlock(&(kvm)->mmu_lock) +#else +#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) spin_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) +#endif /* KVM_HAVE_MMU_RWLOCK */ + + +struct kvm_follow_pfn { + const struct kvm_memory_slot *slot; + const gfn_t gfn; + + unsigned long hva; + + /* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */ + unsigned int flags; + + /* + * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag). + * The page *must* be pinned if KVM will write to the page via a kernel + * mapping, e.g. via kmap(), memremap(), etc. 
+ */ + bool pin; + + /* + * If non-NULL, try to get a writable mapping even for a read fault. + * Set to true if a writable mapping was obtained. + */ + bool *map_writable; + + /* + * Optional output. Set to a valid "struct page" if the returned pfn + * is for a refcounted or pinned struct page, NULL if the returned pfn + * has no struct page or if the struct page is not being refcounted + * (e.g. tail pages of non-compound higher order allocations from + * IO/PFNMAP mappings). + */ + struct page **refcounted_page; +}; + +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp); + +#ifdef CONFIG_HAVE_KVM_PFNCACHE +void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, + unsigned long start, + unsigned long end); +#else +static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, + unsigned long start, + unsigned long end) +{ +} +#endif /* CONFIG_HAVE_KVM_PFNCACHE */ + +#ifdef CONFIG_KVM_GUEST_MEMFD +int kvm_gmem_init(struct module *module); +void kvm_gmem_exit(void); +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned int fd, loff_t offset); +void kvm_gmem_unbind(struct kvm_memory_slot *slot); +#else +static inline int kvm_gmem_init(struct module *module) +{ + return 0; +} +static inline void kvm_gmem_exit(void) {}; +static inline int kvm_gmem_bind(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned int fd, loff_t offset) +{ + WARN_ON_ONCE(1); + return -EIO; +} + +static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot) +{ + WARN_ON_ONCE(1); +} +#endif /* CONFIG_KVM_GUEST_MEMFD */ + +#endif /* __KVM_MM_H__ */ diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c new file mode 100644 index 000000000000..728d2c1b488a --- /dev/null +++ b/virt/kvm/pfncache.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables kernel and guest-mode vCPU access to guest physical + * 
memory with suitable invalidation mechanisms. + * + * Copyright © 2021 Amazon.com, Inc. or its affiliates. + * + * Authors: + * David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/errno.h> + +#include "kvm_mm.h" + +/* + * MMU notifier 'invalidate_range_start' hook. + */ +void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + struct gfn_to_pfn_cache *gpc; + + spin_lock(&kvm->gpc_lock); + list_for_each_entry(gpc, &kvm->gpc_list, list) { + read_lock_irq(&gpc->lock); + + /* Only a single page so no need to care about length */ + if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && + gpc->uhva >= start && gpc->uhva < end) { + read_unlock_irq(&gpc->lock); + + /* + * There is a small window here where the cache could + * be modified, and invalidation would no longer be + * necessary. Hence check again whether invalidation + * is still necessary once the write lock has been + * acquired. + */ + + write_lock_irq(&gpc->lock); + if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && + gpc->uhva >= start && gpc->uhva < end) + gpc->valid = false; + write_unlock_irq(&gpc->lock); + continue; + } + + read_unlock_irq(&gpc->lock); + } + spin_unlock(&kvm->gpc_lock); +} + +static bool kvm_gpc_is_valid_len(gpa_t gpa, unsigned long uhva, + unsigned long len) +{ + unsigned long offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) : + offset_in_page(gpa); + + /* + * The cached access must fit within a single page. The 'len' argument + * to activate() and refresh() exists only to enforce that. + */ + return offset + len <= PAGE_SIZE; +} + +bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) +{ + struct kvm_memslots *slots = kvm_memslots(gpc->kvm); + + if (!gpc->active) + return false; + + /* + * If the page was cached from a memslot, make sure the memslots have + * not been re-configured. 
+ */ + if (!kvm_is_error_gpa(gpc->gpa) && gpc->generation != slots->generation) + return false; + + if (kvm_is_error_hva(gpc->uhva)) + return false; + + if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len)) + return false; + + if (!gpc->valid) + return false; + + return true; +} + +static void *gpc_map(kvm_pfn_t pfn) +{ + if (pfn_valid(pfn)) + return kmap(pfn_to_page(pfn)); + +#ifdef CONFIG_HAS_IOMEM + return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); +#else + return NULL; +#endif +} + +static void gpc_unmap(kvm_pfn_t pfn, void *khva) +{ + /* Unmap the old pfn/page if it was mapped before. */ + if (is_error_noslot_pfn(pfn) || !khva) + return; + + if (pfn_valid(pfn)) { + kunmap(pfn_to_page(pfn)); + return; + } + +#ifdef CONFIG_HAS_IOMEM + memunmap(khva); +#endif +} + +static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq) +{ + /* + * mn_active_invalidate_count acts for all intents and purposes + * like mmu_invalidate_in_progress here; but the latter cannot + * be used here because the invalidation of caches in the + * mmu_notifier event occurs _before_ mmu_invalidate_in_progress + * is elevated. + * + * Note, it does not matter that mn_active_invalidate_count + * is not protected by gpc->lock. It is guaranteed to + * be elevated before the mmu_notifier acquires gpc->lock, and + * isn't dropped until after mmu_invalidate_seq is updated. + */ + if (kvm->mn_active_invalidate_count) + return true; + + /* + * Ensure mn_active_invalidate_count is read before + * mmu_invalidate_seq. This pairs with the smp_wmb() in + * mmu_notifier_invalidate_range_end() to guarantee either the + * old (non-zero) value of mn_active_invalidate_count or the + * new (incremented) value of mmu_invalidate_seq is observed. + */ + smp_rmb(); + return kvm->mmu_invalidate_seq != mmu_seq; +} + +static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) +{ + /* Note, the new page offset may be different than the old! 
*/ + void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva); + kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; + void *new_khva = NULL; + unsigned long mmu_seq; + struct page *page; + + struct kvm_follow_pfn kfp = { + .slot = gpc->memslot, + .gfn = gpa_to_gfn(gpc->gpa), + .flags = FOLL_WRITE, + .hva = gpc->uhva, + .refcounted_page = &page, + }; + + lockdep_assert_held(&gpc->refresh_lock); + + lockdep_assert_held_write(&gpc->lock); + + /* + * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva + * assets have already been updated and so a concurrent check() from a + * different task may not fail the gpa/uhva/generation checks. + */ + gpc->valid = false; + + do { + mmu_seq = gpc->kvm->mmu_invalidate_seq; + smp_rmb(); + + write_unlock_irq(&gpc->lock); + + /* + * If the previous iteration "failed" due to an mmu_notifier + * event, release the pfn and unmap the kernel virtual address + * from the previous attempt. Unmapping might sleep, so this + * needs to be done after dropping the lock. Opportunistically + * check for resched while the lock isn't held. + */ + if (new_pfn != KVM_PFN_ERR_FAULT) { + /* + * Keep the mapping if the previous iteration reused + * the existing mapping and didn't create a new one. + */ + if (new_khva != old_khva) + gpc_unmap(new_pfn, new_khva); + + kvm_release_page_unused(page); + + cond_resched(); + } + + new_pfn = hva_to_pfn(&kfp); + if (is_error_noslot_pfn(new_pfn)) + goto out_error; + + /* + * Obtain a new kernel mapping if KVM itself will access the + * pfn. Note, kmap() and memremap() can both sleep, so this + * too must be done outside of gpc->lock! + */ + if (new_pfn == gpc->pfn) + new_khva = old_khva; + else + new_khva = gpc_map(new_pfn); + + if (!new_khva) { + kvm_release_page_unused(page); + goto out_error; + } + + write_lock_irq(&gpc->lock); + + /* + * Other tasks must wait for _this_ refresh to complete before + * attempting to refresh. 
+ */ + WARN_ON_ONCE(gpc->valid); + } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); + + gpc->valid = true; + gpc->pfn = new_pfn; + gpc->khva = new_khva + offset_in_page(gpc->uhva); + + /* + * Put the reference to the _new_ page. The page is now tracked by the + * cache and can be safely migrated, swapped, etc... as the cache will + * invalidate any mappings in response to relevant mmu_notifier events. + */ + kvm_release_page_clean(page); + + return 0; + +out_error: + write_lock_irq(&gpc->lock); + + return -EFAULT; +} + +static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva) +{ + unsigned long page_offset; + bool unmap_old = false; + unsigned long old_uhva; + kvm_pfn_t old_pfn; + bool hva_change = false; + void *old_khva; + int ret; + + /* Either gpa or uhva must be valid, but not both */ + if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva))) + return -EINVAL; + + lockdep_assert_held(&gpc->refresh_lock); + + write_lock_irq(&gpc->lock); + + if (!gpc->active) { + ret = -EINVAL; + goto out_unlock; + } + + old_pfn = gpc->pfn; + old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva); + old_uhva = PAGE_ALIGN_DOWN(gpc->uhva); + + if (kvm_is_error_gpa(gpa)) { + page_offset = offset_in_page(uhva); + + gpc->gpa = INVALID_GPA; + gpc->memslot = NULL; + gpc->uhva = PAGE_ALIGN_DOWN(uhva); + + if (gpc->uhva != old_uhva) + hva_change = true; + } else { + struct kvm_memslots *slots = kvm_memslots(gpc->kvm); + + page_offset = offset_in_page(gpa); + + if (gpc->gpa != gpa || gpc->generation != slots->generation || + kvm_is_error_hva(gpc->uhva)) { + gfn_t gfn = gpa_to_gfn(gpa); + + gpc->gpa = gpa; + gpc->generation = slots->generation; + gpc->memslot = __gfn_to_memslot(slots, gfn); + gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn); + + if (kvm_is_error_hva(gpc->uhva)) { + ret = -EFAULT; + goto out; + } + + /* + * Even if the GPA and/or the memslot generation changed, the + * HVA may still be the same. 
+ */ + if (gpc->uhva != old_uhva) + hva_change = true; + } else { + gpc->uhva = old_uhva; + } + } + + /* Note: the offset must be correct before calling hva_to_pfn_retry() */ + gpc->uhva += page_offset; + + /* + * If the userspace HVA changed or the PFN was already invalid, + * drop the lock and do the HVA to PFN lookup again. + */ + if (!gpc->valid || hva_change) { + ret = hva_to_pfn_retry(gpc); + } else { + /* + * If the HVA→PFN mapping was already valid, don't unmap it. + * But do update gpc->khva because the offset within the page + * may have changed. + */ + gpc->khva = old_khva + page_offset; + ret = 0; + goto out_unlock; + } + + out: + /* + * Invalidate the cache and purge the pfn/khva if the refresh failed. + * Some/all of the uhva, gpa, and memslot generation info may still be + * valid, leave it as is. + */ + if (ret) { + gpc->valid = false; + gpc->pfn = KVM_PFN_ERR_FAULT; + gpc->khva = NULL; + } + + /* Detect a pfn change before dropping the lock! */ + unmap_old = (old_pfn != gpc->pfn); + +out_unlock: + write_unlock_irq(&gpc->lock); + + if (unmap_old) + gpc_unmap(old_pfn, old_khva); + + return ret; +} + +int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) +{ + unsigned long uhva; + + guard(mutex)(&gpc->refresh_lock); + + if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len)) + return -EINVAL; + + /* + * If the GPA is valid then ignore the HVA, as a cache can be GPA-based + * or HVA-based, not both. For GPA-based caches, the HVA will be + * recomputed during refresh if necessary. + */ + uhva = kvm_is_error_gpa(gpc->gpa) ? 
gpc->uhva : KVM_HVA_ERR_BAD; + + return __kvm_gpc_refresh(gpc, gpc->gpa, uhva); +} + +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) +{ + rwlock_init(&gpc->lock); + mutex_init(&gpc->refresh_lock); + + gpc->kvm = kvm; + gpc->pfn = KVM_PFN_ERR_FAULT; + gpc->gpa = INVALID_GPA; + gpc->uhva = KVM_HVA_ERR_BAD; + gpc->active = gpc->valid = false; +} + +static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva, + unsigned long len) +{ + struct kvm *kvm = gpc->kvm; + + if (!kvm_gpc_is_valid_len(gpa, uhva, len)) + return -EINVAL; + + guard(mutex)(&gpc->refresh_lock); + + if (!gpc->active) { + if (KVM_BUG_ON(gpc->valid, kvm)) + return -EIO; + + spin_lock(&kvm->gpc_lock); + list_add(&gpc->list, &kvm->gpc_list); + spin_unlock(&kvm->gpc_lock); + + /* + * Activate the cache after adding it to the list, a concurrent + * refresh must not establish a mapping until the cache is + * reachable by mmu_notifier events. + */ + write_lock_irq(&gpc->lock); + gpc->active = true; + write_unlock_irq(&gpc->lock); + } + return __kvm_gpc_refresh(gpc, gpa, uhva); +} + +int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) +{ + /* + * Explicitly disallow INVALID_GPA so that the magic value can be used + * by KVM to differentiate between GPA-based and HVA-based caches. 
+ */ + if (WARN_ON_ONCE(kvm_is_error_gpa(gpa))) + return -EINVAL; + + return __kvm_gpc_activate(gpc, gpa, KVM_HVA_ERR_BAD, len); +} + +int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len) +{ + if (!access_ok((void __user *)uhva, len)) + return -EINVAL; + + return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len); +} + +void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) +{ + struct kvm *kvm = gpc->kvm; + kvm_pfn_t old_pfn; + void *old_khva; + + guard(mutex)(&gpc->refresh_lock); + + if (gpc->active) { + /* + * Deactivate the cache before removing it from the list, KVM + * must stall mmu_notifier events until all users go away, i.e. + * until gpc->lock is dropped and refresh is guaranteed to fail. + */ + write_lock_irq(&gpc->lock); + gpc->active = false; + gpc->valid = false; + + /* + * Leave the GPA => uHVA cache intact, it's protected by the + * memslot generation. The PFN lookup needs to be redone every + * time as mmu_notifier protection is lost when the cache is + * removed from the VM's gpc_list. + */ + old_khva = gpc->khva - offset_in_page(gpc->khva); + gpc->khva = NULL; + + old_pfn = gpc->pfn; + gpc->pfn = KVM_PFN_ERR_FAULT; + write_unlock_irq(&gpc->lock); + + spin_lock(&kvm->gpc_lock); + list_del(&gpc->list); + spin_unlock(&kvm->gpc_lock); + + gpc_unmap(old_pfn, old_khva); + } +} diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index d99850c462a1..be50514bbd11 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * VFIO-KVM bridge pseudo device * * Copyright (C) 2013 Red Hat, Inc. All rights reserved. * Author: Alex Williamson <alex.williamson@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/errno.h> @@ -24,149 +21,110 @@ #include <asm/kvm_ppc.h> #endif -struct kvm_vfio_group { +struct kvm_vfio_file { struct list_head node; - struct vfio_group *vfio_group; + struct file *file; +#ifdef CONFIG_SPAPR_TCE_IOMMU + struct iommu_group *iommu_group; +#endif }; struct kvm_vfio { - struct list_head group_list; + struct list_head file_list; struct mutex lock; bool noncoherent; }; -static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep) +static void kvm_vfio_file_set_kvm(struct file *file, struct kvm *kvm) { - struct vfio_group *vfio_group; - struct vfio_group *(*fn)(struct file *); + void (*fn)(struct file *file, struct kvm *kvm); - fn = symbol_get(vfio_group_get_external_user); + fn = symbol_get(vfio_file_set_kvm); if (!fn) - return ERR_PTR(-EINVAL); - - vfio_group = fn(filep); + return; - symbol_put(vfio_group_get_external_user); + fn(file, kvm); - return vfio_group; + symbol_put(vfio_file_set_kvm); } -static bool kvm_vfio_external_group_match_file(struct vfio_group *group, - struct file *filep) +static bool kvm_vfio_file_enforced_coherent(struct file *file) { - bool ret, (*fn)(struct vfio_group *, struct file *); + bool (*fn)(struct file *file); + bool ret; - fn = symbol_get(vfio_external_group_match_file); + fn = symbol_get(vfio_file_enforced_coherent); if (!fn) return false; - ret = fn(group, filep); + ret = fn(file); - symbol_put(vfio_external_group_match_file); + symbol_put(vfio_file_enforced_coherent); return ret; } -static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) -{ - void (*fn)(struct vfio_group *); - - fn = symbol_get(vfio_group_put_external_user); - if (!fn) - return; - - fn(vfio_group); - - symbol_put(vfio_group_put_external_user); -} - -static void kvm_vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) +static bool kvm_vfio_file_is_valid(struct file *file) { - void (*fn)(struct vfio_group *, struct kvm *); + bool (*fn)(struct file *file); + bool ret; - fn = 
symbol_get(vfio_group_set_kvm); - if (!fn) - return; - - fn(group, kvm); - - symbol_put(vfio_group_set_kvm); -} - -static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) -{ - long (*fn)(struct vfio_group *, unsigned long); - long ret; - - fn = symbol_get(vfio_external_check_extension); + fn = symbol_get(vfio_file_is_valid); if (!fn) return false; - ret = fn(vfio_group, VFIO_DMA_CC_IOMMU); + ret = fn(file); - symbol_put(vfio_external_check_extension); + symbol_put(vfio_file_is_valid); - return ret > 0; + return ret; } #ifdef CONFIG_SPAPR_TCE_IOMMU -static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group) +static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) { - int (*fn)(struct vfio_group *); - int ret = -EINVAL; + struct iommu_group *(*fn)(struct file *file); + struct iommu_group *ret; - fn = symbol_get(vfio_external_user_iommu_id); + fn = symbol_get(vfio_file_iommu_group); if (!fn) - return ret; + return NULL; - ret = fn(vfio_group); + ret = fn(file); - symbol_put(vfio_external_user_iommu_id); + symbol_put(vfio_file_iommu_group); return ret; } -static struct iommu_group *kvm_vfio_group_get_iommu_group( - struct vfio_group *group) -{ - int group_id = kvm_vfio_external_user_iommu_id(group); - - if (group_id < 0) - return NULL; - - return iommu_group_get_by_id(group_id); -} - static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, - struct vfio_group *vfio_group) + struct kvm_vfio_file *kvf) { - struct iommu_group *grp = kvm_vfio_group_get_iommu_group(vfio_group); - - if (WARN_ON_ONCE(!grp)) + if (WARN_ON_ONCE(!kvf->iommu_group)) return; - kvm_spapr_tce_release_iommu_group(kvm, grp); - iommu_group_put(grp); + kvm_spapr_tce_release_iommu_group(kvm, kvf->iommu_group); + iommu_group_put(kvf->iommu_group); + kvf->iommu_group = NULL; } #endif /* - * Groups can use the same or different IOMMU domains. If the same then - * adding a new group may change the coherency of groups we've previously - * been told about. 
We don't want to care about any of that so we retest - * each group and bail as soon as we find one that's noncoherent. This - * means we only ever [un]register_noncoherent_dma once for the whole device. + * Groups/devices can use the same or different IOMMU domains. If the same + * then adding a new group/device may change the coherency of groups/devices + * we've previously been told about. We don't want to care about any of + * that so we retest each group/device and bail as soon as we find one that's + * noncoherent. This means we only ever [un]register_noncoherent_dma once + * for the whole device. */ static void kvm_vfio_update_coherency(struct kvm_device *dev) { struct kvm_vfio *kv = dev->private; bool noncoherent = false; - struct kvm_vfio_group *kvg; - - mutex_lock(&kv->lock); + struct kvm_vfio_file *kvf; - list_for_each_entry(kvg, &kv->group_list, node) { - if (!kvm_vfio_group_is_coherent(kvg->vfio_group)) { + list_for_each_entry(kvf, &kv->file_list, node) { + if (!kvm_vfio_file_enforced_coherent(kvf->file)) { noncoherent = true; break; } @@ -180,153 +138,152 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev) else kvm_arch_unregister_noncoherent_dma(dev->kvm); } - - mutex_unlock(&kv->lock); } -static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) +static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; - struct vfio_group *vfio_group; - struct kvm_vfio_group *kvg; - int32_t __user *argp = (int32_t __user *)(unsigned long)arg; - struct fd f; - int32_t fd; - int ret; - - switch (attr) { - case KVM_DEV_VFIO_GROUP_ADD: - if (get_user(fd, argp)) - return -EFAULT; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - vfio_group = kvm_vfio_group_get_external_user(f.file); - fdput(f); - - if (IS_ERR(vfio_group)) - return PTR_ERR(vfio_group); - - mutex_lock(&kv->lock); + struct kvm_vfio_file *kvf; + struct file *filp; + int ret = 0; + + filp = fget(fd); + if (!filp) + return 
-EBADF; + + /* Ensure the FD is a vfio FD. */ + if (!kvm_vfio_file_is_valid(filp)) { + ret = -EINVAL; + goto out_fput; + } - list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->vfio_group == vfio_group) { - mutex_unlock(&kv->lock); - kvm_vfio_group_put_external_user(vfio_group); - return -EEXIST; - } - } + mutex_lock(&kv->lock); - kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); - if (!kvg) { - mutex_unlock(&kv->lock); - kvm_vfio_group_put_external_user(vfio_group); - return -ENOMEM; + list_for_each_entry(kvf, &kv->file_list, node) { + if (kvf->file == filp) { + ret = -EEXIST; + goto out_unlock; } + } - list_add_tail(&kvg->node, &kv->group_list); - kvg->vfio_group = vfio_group; - - kvm_arch_start_assignment(dev->kvm); - - mutex_unlock(&kv->lock); + kvf = kzalloc(sizeof(*kvf), GFP_KERNEL_ACCOUNT); + if (!kvf) { + ret = -ENOMEM; + goto out_unlock; + } - kvm_vfio_group_set_kvm(vfio_group, dev->kvm); + kvf->file = get_file(filp); + list_add_tail(&kvf->node, &kv->file_list); - kvm_vfio_update_coherency(dev); + kvm_vfio_file_set_kvm(kvf->file, dev->kvm); + kvm_vfio_update_coherency(dev); - return 0; +out_unlock: + mutex_unlock(&kv->lock); +out_fput: + fput(filp); + return ret; +} - case KVM_DEV_VFIO_GROUP_DEL: - if (get_user(fd, argp)) - return -EFAULT; +static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd) +{ + struct kvm_vfio *kv = dev->private; + struct kvm_vfio_file *kvf; + CLASS(fd, f)(fd); + int ret; - f = fdget(fd); - if (!f.file) - return -EBADF; + if (fd_empty(f)) + return -EBADF; - ret = -ENOENT; + ret = -ENOENT; - mutex_lock(&kv->lock); + mutex_lock(&kv->lock); - list_for_each_entry(kvg, &kv->group_list, node) { - if (!kvm_vfio_external_group_match_file(kvg->vfio_group, - f.file)) - continue; + list_for_each_entry(kvf, &kv->file_list, node) { + if (kvf->file != fd_file(f)) + continue; - list_del(&kvg->node); - kvm_arch_end_assignment(dev->kvm); + list_del(&kvf->node); #ifdef CONFIG_SPAPR_TCE_IOMMU - kvm_spapr_tce_release_vfio_group(dev->kvm, - 
kvg->vfio_group); + kvm_spapr_tce_release_vfio_group(dev->kvm, kvf); #endif - kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); - kvm_vfio_group_put_external_user(kvg->vfio_group); - kfree(kvg); - ret = 0; - break; - } + kvm_vfio_file_set_kvm(kvf->file, NULL); + fput(kvf->file); + kfree(kvf); + ret = 0; + break; + } - mutex_unlock(&kv->lock); + kvm_vfio_update_coherency(dev); - fdput(f); + mutex_unlock(&kv->lock); + return ret; +} - kvm_vfio_update_coherency(dev); +#ifdef CONFIG_SPAPR_TCE_IOMMU +static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev, + void __user *arg) +{ + struct kvm_vfio_spapr_tce param; + struct kvm_vfio *kv = dev->private; + struct kvm_vfio_file *kvf; + int ret; - return ret; + if (copy_from_user(&param, arg, sizeof(struct kvm_vfio_spapr_tce))) + return -EFAULT; -#ifdef CONFIG_SPAPR_TCE_IOMMU - case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: { - struct kvm_vfio_spapr_tce param; - struct kvm_vfio *kv = dev->private; - struct vfio_group *vfio_group; - struct kvm_vfio_group *kvg; - struct fd f; - struct iommu_group *grp; - - if (copy_from_user(&param, (void __user *)arg, - sizeof(struct kvm_vfio_spapr_tce))) - return -EFAULT; + CLASS(fd, f)(param.groupfd); + if (fd_empty(f)) + return -EBADF; - f = fdget(param.groupfd); - if (!f.file) - return -EBADF; + ret = -ENOENT; - vfio_group = kvm_vfio_group_get_external_user(f.file); - fdput(f); + mutex_lock(&kv->lock); - if (IS_ERR(vfio_group)) - return PTR_ERR(vfio_group); + list_for_each_entry(kvf, &kv->file_list, node) { + if (kvf->file != fd_file(f)) + continue; - grp = kvm_vfio_group_get_iommu_group(vfio_group); - if (WARN_ON_ONCE(!grp)) { - kvm_vfio_group_put_external_user(vfio_group); - return -EIO; + if (!kvf->iommu_group) { + kvf->iommu_group = kvm_vfio_file_iommu_group(kvf->file); + if (WARN_ON_ONCE(!kvf->iommu_group)) { + ret = -EIO; + goto err_fdput; + } } - ret = -ENOENT; - - mutex_lock(&kv->lock); + ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd, + kvf->iommu_group); + break; + } - 
list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->vfio_group != vfio_group) - continue; +err_fdput: + mutex_unlock(&kv->lock); + return ret; +} +#endif - ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, - param.tablefd, grp); - break; - } +static int kvm_vfio_set_file(struct kvm_device *dev, long attr, + void __user *arg) +{ + int32_t __user *argp = arg; + int32_t fd; - mutex_unlock(&kv->lock); + switch (attr) { + case KVM_DEV_VFIO_FILE_ADD: + if (get_user(fd, argp)) + return -EFAULT; + return kvm_vfio_file_add(dev, fd); - iommu_group_put(grp); - kvm_vfio_group_put_external_user(vfio_group); + case KVM_DEV_VFIO_FILE_DEL: + if (get_user(fd, argp)) + return -EFAULT; + return kvm_vfio_file_del(dev, fd); - return ret; - } -#endif /* CONFIG_SPAPR_TCE_IOMMU */ +#ifdef CONFIG_SPAPR_TCE_IOMMU + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: + return kvm_vfio_file_set_spapr_tce(dev, arg); +#endif } return -ENXIO; @@ -336,8 +293,9 @@ static int kvm_vfio_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { - case KVM_DEV_VFIO_GROUP: - return kvm_vfio_set_group(dev, attr->attr, attr->addr); + case KVM_DEV_VFIO_FILE: + return kvm_vfio_set_file(dev, attr->attr, + u64_to_user_ptr(attr->addr)); } return -ENXIO; @@ -347,10 +305,10 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { - case KVM_DEV_VFIO_GROUP: + case KVM_DEV_VFIO_FILE: switch (attr->attr) { - case KVM_DEV_VFIO_GROUP_ADD: - case KVM_DEV_VFIO_GROUP_DEL: + case KVM_DEV_VFIO_FILE_ADD: + case KVM_DEV_VFIO_FILE_DEL: #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: #endif @@ -363,34 +321,33 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, return -ENXIO; } -static void kvm_vfio_destroy(struct kvm_device *dev) +static void kvm_vfio_release(struct kvm_device *dev) { struct kvm_vfio *kv = dev->private; - struct kvm_vfio_group *kvg, *tmp; + struct kvm_vfio_file *kvf, *tmp; - list_for_each_entry_safe(kvg, 
tmp, &kv->group_list, node) { + list_for_each_entry_safe(kvf, tmp, &kv->file_list, node) { #ifdef CONFIG_SPAPR_TCE_IOMMU - kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group); + kvm_spapr_tce_release_vfio_group(dev->kvm, kvf); #endif - kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); - kvm_vfio_group_put_external_user(kvg->vfio_group); - list_del(&kvg->node); - kfree(kvg); - kvm_arch_end_assignment(dev->kvm); + kvm_vfio_file_set_kvm(kvf->file, NULL); + fput(kvf->file); + list_del(&kvf->node); + kfree(kvf); } kvm_vfio_update_coherency(dev); kfree(kv); - kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ + kfree(dev); /* alloc by kvm_ioctl_create_device, free by .release */ } static int kvm_vfio_create(struct kvm_device *dev, u32 type); -static struct kvm_device_ops kvm_vfio_ops = { +static const struct kvm_device_ops kvm_vfio_ops = { .name = "kvm-vfio", .create = kvm_vfio_create, - .destroy = kvm_vfio_destroy, + .release = kvm_vfio_release, .set_attr = kvm_vfio_set_attr, .has_attr = kvm_vfio_has_attr, }; @@ -400,16 +357,18 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type) struct kvm_device *tmp; struct kvm_vfio *kv; + lockdep_assert_held(&dev->kvm->lock); + /* Only one VFIO "device" per VM */ list_for_each_entry(tmp, &dev->kvm->devices, vm_node) if (tmp->ops == &kvm_vfio_ops) return -EBUSY; - kv = kzalloc(sizeof(*kv), GFP_KERNEL); + kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT); if (!kv) return -ENOMEM; - INIT_LIST_HEAD(&kv->group_list); + INIT_LIST_HEAD(&kv->file_list); mutex_init(&kv->lock); dev->private = kv; |
