diff options
Diffstat (limited to 'arch/riscv/kvm')
33 files changed, 9028 insertions, 1569 deletions
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index f36a737d5f96..77379f77840a 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -18,16 +18,21 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION config KVM - tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" + tristate "Kernel-based Virtual Machine (KVM) support" depends on RISCV_SBI && MMU - select MMU_NOTIFIER - select PREEMPT_NOTIFIERS - select KVM_MMIO + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_MSI + select HAVE_KVM_READONLY_MEM + select HAVE_KVM_DIRTY_RING_ACQ_REL + select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select KVM_XFER_TO_GUEST_WORK - select HAVE_KVM_VCPU_ASYNC_IOCTL - select HAVE_KVM_EVENTFD - select SRCU + select KVM_GENERIC_HARDWARE_ENABLING + select KVM_MMIO + select VIRT_XFER_TO_GUEST_WORK + select KVM_GENERIC_MMU_NOTIFIER + select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS help Support hosting virtualized guest machines. diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index 019df9208bdd..3b8afb038b35 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -3,25 +3,40 @@ # Makefile for RISC-V KVM support # -ccflags-y += -I $(srctree)/$(src) +ccflags-y += -I $(src) include $(srctree)/virt/kvm/Makefile.kvm obj-$(CONFIG_KVM) += kvm.o +# Ordered alphabetically +kvm-y += aia.o +kvm-y += aia_aplic.o +kvm-y += aia_device.o +kvm-y += aia_imsic.o +kvm-y += gstage.o kvm-y += main.o -kvm-y += vm.o -kvm-y += vmid.o -kvm-y += tlb.o kvm-y += mmu.o +kvm-y += nacl.o +kvm-y += tlb.o kvm-y += vcpu.o kvm-y += vcpu_exit.o kvm-y += vcpu_fp.o kvm-y += vcpu_insn.o -kvm-y += vcpu_switch.o +kvm-y += vcpu_onereg.o +kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o kvm-y += vcpu_sbi.o -kvm-$(CONFIG_RISCV_SBI_V01) += vcpu_sbi_v01.o kvm-y += vcpu_sbi_base.o -kvm-y += vcpu_sbi_replace.o +kvm-y += vcpu_sbi_forward.o +kvm-y += vcpu_sbi_fwft.o kvm-y += vcpu_sbi_hsm.o +kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_sbi_pmu.o +kvm-y += vcpu_sbi_replace.o +kvm-y += vcpu_sbi_sta.o +kvm-y += vcpu_sbi_system.o +kvm-$(CONFIG_RISCV_SBI_V01) += vcpu_sbi_v01.o +kvm-y += vcpu_switch.o kvm-y += vcpu_timer.o +kvm-y += vcpu_vector.o +kvm-y += vm.o +kvm-y += vmid.o diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c new file mode 100644 index 000000000000..dad318185660 --- /dev/null +++ b/arch/riscv/kvm/aia.c @@ -0,0 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + * + * Authors: + * Anup Patel <apatel@ventanamicro.com> + */ + +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/irq.h> +#include <linux/irqchip/riscv-imsic.h> +#include <linux/irqdomain.h> +#include <linux/kvm_host.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <asm/cpufeature.h> +#include <asm/kvm_nacl.h> + +struct aia_hgei_control { + raw_spinlock_t lock; + unsigned long free_bitmap; + struct kvm_vcpu *owners[BITS_PER_LONG]; +}; +static DEFINE_PER_CPU(struct aia_hgei_control, aia_hgei); +static int hgei_parent_irq; + +unsigned int kvm_riscv_aia_nr_hgei; +unsigned int kvm_riscv_aia_max_ids; +DEFINE_STATIC_KEY_FALSE(kvm_riscv_aia_available); + +static inline unsigned long aia_hvictl_value(bool ext_irq_pending) +{ + unsigned long hvictl; + + /* + * HVICTL.IID == 9 and HVICTL.IPRIO == 0 represents + * no interrupt in HVICTL. + */ + + hvictl = (IRQ_S_EXT << HVICTL_IID_SHIFT) & HVICTL_IID; + hvictl |= ext_irq_pending; + return hvictl; +} + +#ifdef CONFIG_32BIT +void kvm_riscv_vcpu_aia_flush_interrupts(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + unsigned long mask, val; + + if (!kvm_riscv_aia_available()) + return; + + if (READ_ONCE(vcpu->arch.irqs_pending_mask[1])) { + mask = xchg_acquire(&vcpu->arch.irqs_pending_mask[1], 0); + val = READ_ONCE(vcpu->arch.irqs_pending[1]) & mask; + + csr->hviph &= ~mask; + csr->hviph |= val; + } +} + +void kvm_riscv_vcpu_aia_sync_interrupts(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + + if (kvm_riscv_aia_available()) + csr->vsieh = ncsr_read(CSR_VSIEH); +} +#endif + +bool kvm_riscv_vcpu_aia_has_interrupts(struct kvm_vcpu *vcpu, u64 mask) +{ + unsigned long seip; + + if (!kvm_riscv_aia_available()) + return false; + +#ifdef CONFIG_32BIT + if (READ_ONCE(vcpu->arch.irqs_pending[1]) & + (vcpu->arch.aia_context.guest_csr.vsieh & upper_32_bits(mask))) + return true; +#endif + + seip = vcpu->arch.guest_csr.vsie; + seip &= (unsigned long)mask; + seip &= BIT(IRQ_S_EXT); + + if (!kvm_riscv_aia_initialized(vcpu->kvm) || !seip) + return false; + + return kvm_riscv_vcpu_aia_imsic_has_interrupt(vcpu); +} + +void kvm_riscv_vcpu_aia_update_hvip(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + + if (!kvm_riscv_aia_available()) + return; + +#ifdef CONFIG_32BIT + ncsr_write(CSR_HVIPH, vcpu->arch.aia_context.guest_csr.hviph); +#endif + ncsr_write(CSR_HVICTL, aia_hvictl_value(!!(csr->hvip & BIT(IRQ_VS_EXT)))); +} + +void kvm_riscv_vcpu_aia_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + void *nsh; + + if (!kvm_riscv_aia_available()) + return; + + if (kvm_riscv_nacl_sync_csr_available()) { + nsh = nacl_shmem(); + nacl_csr_write(nsh, CSR_VSISELECT, csr->vsiselect); + nacl_csr_write(nsh, CSR_HVIPRIO1, csr->hviprio1); + nacl_csr_write(nsh, CSR_HVIPRIO2, csr->hviprio2); +#ifdef CONFIG_32BIT + nacl_csr_write(nsh, CSR_VSIEH, csr->vsieh); + nacl_csr_write(nsh, CSR_HVIPH, csr->hviph); + nacl_csr_write(nsh, CSR_HVIPRIO1H, csr->hviprio1h); + nacl_csr_write(nsh, CSR_HVIPRIO2H, csr->hviprio2h); +#endif + } else { + csr_write(CSR_VSISELECT, csr->vsiselect); + csr_write(CSR_HVIPRIO1, csr->hviprio1); + csr_write(CSR_HVIPRIO2, csr->hviprio2); +#ifdef CONFIG_32BIT + csr_write(CSR_VSIEH, csr->vsieh); + csr_write(CSR_HVIPH, csr->hviph); + csr_write(CSR_HVIPRIO1H, csr->hviprio1h); + csr_write(CSR_HVIPRIO2H, csr->hviprio2h); +#endif + } + + if (kvm_riscv_aia_initialized(vcpu->kvm)) + kvm_riscv_vcpu_aia_imsic_load(vcpu, cpu); +} + +void kvm_riscv_vcpu_aia_put(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + void *nsh; + + if (!kvm_riscv_aia_available()) + return; + + if (kvm_riscv_aia_initialized(vcpu->kvm)) + kvm_riscv_vcpu_aia_imsic_put(vcpu); + + if (kvm_riscv_nacl_available()) { + nsh = nacl_shmem(); + csr->vsiselect = nacl_csr_read(nsh, CSR_VSISELECT); + csr->hviprio1 = nacl_csr_read(nsh, CSR_HVIPRIO1); + csr->hviprio2 = nacl_csr_read(nsh, CSR_HVIPRIO2); +#ifdef CONFIG_32BIT + csr->vsieh = nacl_csr_read(nsh, CSR_VSIEH); + csr->hviph = nacl_csr_read(nsh, CSR_HVIPH); + csr->hviprio1h = nacl_csr_read(nsh, CSR_HVIPRIO1H); + csr->hviprio2h = nacl_csr_read(nsh, CSR_HVIPRIO2H); +#endif + } else { + csr->vsiselect = csr_read(CSR_VSISELECT); + csr->hviprio1 = csr_read(CSR_HVIPRIO1); + csr->hviprio2 = csr_read(CSR_HVIPRIO2); +#ifdef CONFIG_32BIT + csr->vsieh = csr_read(CSR_VSIEH); + csr->hviph = csr_read(CSR_HVIPH); + csr->hviprio1h = csr_read(CSR_HVIPRIO1H); + csr->hviprio2h = csr_read(CSR_HVIPRIO2H); +#endif + } +} + +int kvm_riscv_vcpu_aia_get_csr(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long *out_val) +{ + struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + + if (reg_num >= sizeof(struct kvm_riscv_aia_csr) / sizeof(unsigned long)) + return -ENOENT; + + *out_val = 0; + if (kvm_riscv_aia_available()) + *out_val = ((unsigned long *)csr)[reg_num]; + + return 0; +} + +int kvm_riscv_vcpu_aia_set_csr(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long val) +{ + struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + + if (reg_num >= sizeof(struct kvm_riscv_aia_csr) / sizeof(unsigned long)) + return -ENOENT; + + if (kvm_riscv_aia_available()) { + ((unsigned long *)csr)[reg_num] = val; + +#ifdef CONFIG_32BIT + if (reg_num == KVM_REG_RISCV_CSR_AIA_REG(siph)) + WRITE_ONCE(vcpu->arch.irqs_pending_mask[1], 0); +#endif + } + + return 0; +} + +int kvm_riscv_vcpu_aia_rmw_topei(struct kvm_vcpu *vcpu, + unsigned int csr_num, + unsigned long *val, + unsigned long new_val, + unsigned long wr_mask) +{ + /* If AIA not available then redirect trap */ + if (!kvm_riscv_aia_available()) + return KVM_INSN_ILLEGAL_TRAP; + + /* If AIA not initialized then forward to user space */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return KVM_INSN_EXIT_TO_USER_SPACE; + + return kvm_riscv_vcpu_aia_imsic_rmw(vcpu, KVM_RISCV_AIA_IMSIC_TOPEI, + val, new_val, wr_mask); +} + +/* + * External IRQ priority always read-only zero. This means default + * priority order is always preferred for external IRQs unless + * HVICTL.IID == 9 and HVICTL.IPRIO != 0 + */ +static int aia_irq2bitpos[] = { +0, 8, -1, -1, 16, 24, -1, -1, /* 0 - 7 */ +32, -1, -1, -1, -1, 40, 48, 56, /* 8 - 15 */ +64, 72, 80, 88, 96, 104, 112, 120, /* 16 - 23 */ +-1, -1, -1, -1, -1, -1, -1, -1, /* 24 - 31 */ +-1, -1, -1, -1, -1, -1, -1, -1, /* 32 - 39 */ +-1, -1, -1, -1, -1, -1, -1, -1, /* 40 - 47 */ +-1, -1, -1, -1, -1, -1, -1, -1, /* 48 - 55 */ +-1, -1, -1, -1, -1, -1, -1, -1, /* 56 - 63 */ +}; + +static u8 aia_get_iprio8(struct kvm_vcpu *vcpu, unsigned int irq) +{ + unsigned long hviprio; + int bitpos = aia_irq2bitpos[irq]; + + if (bitpos < 0) + return 0; + + switch (bitpos / BITS_PER_LONG) { + case 0: + hviprio = ncsr_read(CSR_HVIPRIO1); + break; + case 1: +#ifndef CONFIG_32BIT + hviprio = ncsr_read(CSR_HVIPRIO2); + break; +#else + hviprio = ncsr_read(CSR_HVIPRIO1H); + break; + case 2: + hviprio = ncsr_read(CSR_HVIPRIO2); + break; + case 3: + hviprio = ncsr_read(CSR_HVIPRIO2H); + break; +#endif + default: + return 0; + } + + return (hviprio >> (bitpos % BITS_PER_LONG)) & TOPI_IPRIO_MASK; +} + +static void aia_set_iprio8(struct kvm_vcpu *vcpu, unsigned int irq, u8 prio) +{ + unsigned long hviprio; + int bitpos = aia_irq2bitpos[irq]; + + if (bitpos < 0) + return; + + switch (bitpos / BITS_PER_LONG) { + case 0: + hviprio = ncsr_read(CSR_HVIPRIO1); + break; + case 1: +#ifndef CONFIG_32BIT + hviprio = ncsr_read(CSR_HVIPRIO2); + break; +#else + hviprio = ncsr_read(CSR_HVIPRIO1H); + break; + case 2: + hviprio = ncsr_read(CSR_HVIPRIO2); + break; + case 3: + hviprio = ncsr_read(CSR_HVIPRIO2H); + break; +#endif + default: + return; + } + + hviprio &= ~(TOPI_IPRIO_MASK << (bitpos % BITS_PER_LONG)); + hviprio |= (unsigned long)prio << (bitpos % BITS_PER_LONG); + + switch (bitpos / BITS_PER_LONG) { + case 0: + ncsr_write(CSR_HVIPRIO1, hviprio); + break; + case 1: +#ifndef CONFIG_32BIT + ncsr_write(CSR_HVIPRIO2, hviprio); + break; +#else + ncsr_write(CSR_HVIPRIO1H, hviprio); + break; + case 2: + ncsr_write(CSR_HVIPRIO2, hviprio); + break; + case 3: + ncsr_write(CSR_HVIPRIO2H, hviprio); + break; +#endif + default: + return; + } +} + +static int aia_rmw_iprio(struct kvm_vcpu *vcpu, unsigned int isel, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + int i, first_irq, nirqs; + unsigned long old_val; + u8 prio; + +#ifndef CONFIG_32BIT + if (isel & 0x1) + return KVM_INSN_ILLEGAL_TRAP; +#endif + + nirqs = 4 * (BITS_PER_LONG / 32); + first_irq = (isel - ISELECT_IPRIO0) * 4; + + old_val = 0; + for (i = 0; i < nirqs; i++) { + prio = aia_get_iprio8(vcpu, first_irq + i); + old_val |= (unsigned long)prio << (TOPI_IPRIO_BITS * i); + } + + if (val) + *val = old_val; + + if (wr_mask) { + new_val = (old_val & ~wr_mask) | (new_val & wr_mask); + for (i = 0; i < nirqs; i++) { + prio = (new_val >> (TOPI_IPRIO_BITS * i)) & + TOPI_IPRIO_MASK; + aia_set_iprio8(vcpu, first_irq + i, prio); + } + } + + return KVM_INSN_CONTINUE_NEXT_SEPC; +} + +int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + unsigned int isel; + + /* If AIA not available then redirect trap */ + if (!kvm_riscv_aia_available()) + return KVM_INSN_ILLEGAL_TRAP; + + /* First try to emulate in kernel space */ + isel = ncsr_read(CSR_VSISELECT) & ISELECT_MASK; + if (isel >= ISELECT_IPRIO0 && isel <= ISELECT_IPRIO15) + return aia_rmw_iprio(vcpu, isel, val, new_val, wr_mask); + else if (isel >= IMSIC_FIRST && isel <= IMSIC_LAST && + kvm_riscv_aia_initialized(vcpu->kvm)) + return kvm_riscv_vcpu_aia_imsic_rmw(vcpu, isel, val, new_val, + wr_mask); + + /* We can't handle it here so redirect to user space */ + return KVM_INSN_EXIT_TO_USER_SPACE; +} + +int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner, + void __iomem **hgei_va, phys_addr_t *hgei_pa) +{ + int ret = -ENOENT; + unsigned long flags; + const struct imsic_global_config *gc; + const struct imsic_local_config *lc; + struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu); + + if (!kvm_riscv_aia_available() || !hgctrl) + return -ENODEV; + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + + if (hgctrl->free_bitmap) { + ret = __ffs(hgctrl->free_bitmap); + hgctrl->free_bitmap &= ~BIT(ret); + hgctrl->owners[ret] = owner; + } + + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); + + gc = imsic_get_global_config(); + lc = (gc) ? per_cpu_ptr(gc->local, cpu) : NULL; + if (lc && ret > 0) { + if (hgei_va) + *hgei_va = lc->msi_va + (ret * IMSIC_MMIO_PAGE_SZ); + if (hgei_pa) + *hgei_pa = lc->msi_pa + (ret * IMSIC_MMIO_PAGE_SZ); + } + + return ret; +} + +void kvm_riscv_aia_free_hgei(int cpu, int hgei) +{ + unsigned long flags; + struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu); + + if (!kvm_riscv_aia_available() || !hgctrl) + return; + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + + if (hgei > 0 && hgei <= kvm_riscv_aia_nr_hgei) { + if (!(hgctrl->free_bitmap & BIT(hgei))) { + hgctrl->free_bitmap |= BIT(hgei); + hgctrl->owners[hgei] = NULL; + } + } + + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); +} + +static irqreturn_t hgei_interrupt(int irq, void *dev_id) +{ + int i; + unsigned long hgei_mask, flags; + struct aia_hgei_control *hgctrl = get_cpu_ptr(&aia_hgei); + + hgei_mask = csr_read(CSR_HGEIP) & csr_read(CSR_HGEIE); + csr_clear(CSR_HGEIE, hgei_mask); + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + + for_each_set_bit(i, &hgei_mask, BITS_PER_LONG) { + if (hgctrl->owners[i]) + kvm_vcpu_kick(hgctrl->owners[i]); + } + + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); + + put_cpu_ptr(&aia_hgei); + return IRQ_HANDLED; +} + +static int aia_hgei_init(void) +{ + int cpu, rc; + struct irq_domain *domain; + struct aia_hgei_control *hgctrl; + + /* Initialize per-CPU guest external interrupt line management */ + for_each_possible_cpu(cpu) { + hgctrl = per_cpu_ptr(&aia_hgei, cpu); + raw_spin_lock_init(&hgctrl->lock); + if (kvm_riscv_aia_nr_hgei) { + hgctrl->free_bitmap = + BIT(kvm_riscv_aia_nr_hgei + 1) - 1; + hgctrl->free_bitmap &= ~BIT(0); + } else + hgctrl->free_bitmap = 0; + } + + /* Skip SGEI interrupt setup for zero guest external interrupts */ + if (!kvm_riscv_aia_nr_hgei) + goto skip_sgei_interrupt; + + /* Find INTC irq domain */ + domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), + DOMAIN_BUS_ANY); + if (!domain) { + kvm_err("unable to find INTC domain\n"); + return -ENOENT; + } + + /* Map per-CPU SGEI interrupt from INTC domain */ + hgei_parent_irq = irq_create_mapping(domain, IRQ_S_GEXT); + if (!hgei_parent_irq) { + kvm_err("unable to map SGEI IRQ\n"); + return -ENOMEM; + } + + /* Request per-CPU SGEI interrupt */ + rc = request_percpu_irq(hgei_parent_irq, hgei_interrupt, + "riscv-kvm", &aia_hgei); + if (rc) { + kvm_err("failed to request SGEI IRQ\n"); + return rc; + } + +skip_sgei_interrupt: + return 0; +} + +static void aia_hgei_exit(void) +{ + /* Do nothing for zero guest external interrupts */ + if (!kvm_riscv_aia_nr_hgei) + return; + + /* Free per-CPU SGEI interrupt */ + free_percpu_irq(hgei_parent_irq, &aia_hgei); +} + +void kvm_riscv_aia_enable(void) +{ + if (!kvm_riscv_aia_available()) + return; + + csr_write(CSR_HVICTL, aia_hvictl_value(false)); + csr_write(CSR_HVIPRIO1, 0x0); + csr_write(CSR_HVIPRIO2, 0x0); +#ifdef CONFIG_32BIT + csr_write(CSR_HVIPH, 0x0); + csr_write(CSR_HIDELEGH, 0x0); + csr_write(CSR_HVIPRIO1H, 0x0); + csr_write(CSR_HVIPRIO2H, 0x0); +#endif + + /* Enable per-CPU SGEI interrupt */ + enable_percpu_irq(hgei_parent_irq, + irq_get_trigger_type(hgei_parent_irq)); + csr_set(CSR_HIE, BIT(IRQ_S_GEXT)); + /* Enable IRQ filtering for overflow interrupt only if sscofpmf is present */ + if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSCOFPMF)) + csr_set(CSR_HVIEN, BIT(IRQ_PMU_OVF)); +} + +void kvm_riscv_aia_disable(void) +{ + int i; + unsigned long flags; + struct kvm_vcpu *vcpu; + struct aia_hgei_control *hgctrl; + + if (!kvm_riscv_aia_available()) + return; + hgctrl = get_cpu_ptr(&aia_hgei); + + if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSCOFPMF)) + csr_clear(CSR_HVIEN, BIT(IRQ_PMU_OVF)); + /* Disable per-CPU SGEI interrupt */ + csr_clear(CSR_HIE, BIT(IRQ_S_GEXT)); + disable_percpu_irq(hgei_parent_irq); + + csr_write(CSR_HVICTL, aia_hvictl_value(false)); + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + + for (i = 0; i <= kvm_riscv_aia_nr_hgei; i++) { + vcpu = hgctrl->owners[i]; + if (!vcpu) + continue; + + /* + * We release hgctrl->lock before notifying IMSIC + * so that we don't have lock ordering issues. + */ + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); + + /* Notify IMSIC */ + kvm_riscv_vcpu_aia_imsic_release(vcpu); + + /* + * Wakeup VCPU if it was blocked so that it can + * run on other HARTs + */ + if (csr_read(CSR_HGEIE) & BIT(i)) { + csr_clear(CSR_HGEIE, BIT(i)); + kvm_vcpu_kick(vcpu); + } + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + } + + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); + + put_cpu_ptr(&aia_hgei); +} + +int kvm_riscv_aia_init(void) +{ + int rc; + const struct imsic_global_config *gc; + + if (!riscv_isa_extension_available(NULL, SxAIA)) + return -ENODEV; + gc = imsic_get_global_config(); + + /* Figure-out number of bits in HGEIE */ + csr_write(CSR_HGEIE, -1UL); + kvm_riscv_aia_nr_hgei = fls_long(csr_read(CSR_HGEIE)); + csr_write(CSR_HGEIE, 0); + if (kvm_riscv_aia_nr_hgei) + kvm_riscv_aia_nr_hgei--; + + /* + * Number of usable HGEI lines should be minimum of per-HART + * IMSIC guest files and number of bits in HGEIE + */ + if (gc) + kvm_riscv_aia_nr_hgei = min((ulong)kvm_riscv_aia_nr_hgei, + BIT(gc->guest_index_bits) - 1); + else + kvm_riscv_aia_nr_hgei = 0; + + /* Find number of guest MSI IDs */ + kvm_riscv_aia_max_ids = IMSIC_MAX_ID; + if (gc && kvm_riscv_aia_nr_hgei) + kvm_riscv_aia_max_ids = gc->nr_guest_ids + 1; + + /* Initialize guest external interrupt line management */ + rc = aia_hgei_init(); + if (rc) + return rc; + + /* Register device operations */ + rc = kvm_register_device_ops(&kvm_riscv_aia_device_ops, + KVM_DEV_TYPE_RISCV_AIA); + if (rc) { + aia_hgei_exit(); + return rc; + } + + /* Enable KVM AIA support */ + static_branch_enable(&kvm_riscv_aia_available); + + return 0; +} + +void kvm_riscv_aia_exit(void) +{ + if (!kvm_riscv_aia_available()) + return; + + /* Unregister device operations */ + kvm_unregister_device_ops(KVM_DEV_TYPE_RISCV_AIA); + + /* Cleanup the HGEI state */ + aia_hgei_exit(); +} diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c new file mode 100644 index 000000000000..f59d1c0c8c43 --- /dev/null +++ b/arch/riscv/kvm/aia_aplic.c @@ -0,0 +1,645 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + * + * Authors: + * Anup Patel <apatel@ventanamicro.com> + */ + +#include <linux/irqchip/riscv-aplic.h> +#include <linux/kvm_host.h> +#include <linux/math.h> +#include <linux/spinlock.h> +#include <linux/swab.h> +#include <kvm/iodev.h> + +struct aplic_irq { + raw_spinlock_t lock; + u32 sourcecfg; + u32 state; +#define APLIC_IRQ_STATE_PENDING BIT(0) +#define APLIC_IRQ_STATE_ENABLED BIT(1) +#define APLIC_IRQ_STATE_ENPEND (APLIC_IRQ_STATE_PENDING | \ + APLIC_IRQ_STATE_ENABLED) +#define APLIC_IRQ_STATE_INPUT BIT(8) + u32 target; +}; + +struct aplic { + struct kvm_io_device iodev; + + u32 domaincfg; + u32 genmsi; + + u32 nr_irqs; + u32 nr_words; + struct aplic_irq *irqs; +}; + +static u32 aplic_read_sourcecfg(struct aplic *aplic, u32 irq) +{ + u32 ret; + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return 0; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + ret = irqd->sourcecfg; + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_write_sourcecfg(struct aplic *aplic, u32 irq, u32 val) +{ + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return; + irqd = &aplic->irqs[irq]; + + if (val & APLIC_SOURCECFG_D) + val = 0; + else + val &= APLIC_SOURCECFG_SM_MASK; + + raw_spin_lock_irqsave(&irqd->lock, flags); + irqd->sourcecfg = val; + raw_spin_unlock_irqrestore(&irqd->lock, flags); +} + +static u32 aplic_read_target(struct aplic *aplic, u32 irq) +{ + u32 ret; + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return 0; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + ret = irqd->target; + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_write_target(struct aplic *aplic, u32 irq, u32 val) +{ + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return; + irqd = &aplic->irqs[irq]; + + val &= APLIC_TARGET_EIID_MASK | + (APLIC_TARGET_HART_IDX_MASK << APLIC_TARGET_HART_IDX_SHIFT) | + (APLIC_TARGET_GUEST_IDX_MASK << APLIC_TARGET_GUEST_IDX_SHIFT); + + raw_spin_lock_irqsave(&irqd->lock, flags); + irqd->target = val; + raw_spin_unlock_irqrestore(&irqd->lock, flags); +} + +static bool aplic_read_pending(struct aplic *aplic, u32 irq) +{ + bool ret; + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return false; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + ret = (irqd->state & APLIC_IRQ_STATE_PENDING) ? true : false; + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_write_pending(struct aplic *aplic, u32 irq, bool pending) +{ + unsigned long flags, sm; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + + sm = irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK; + if (sm == APLIC_SOURCECFG_SM_INACTIVE) + goto skip_write_pending; + + if (sm == APLIC_SOURCECFG_SM_LEVEL_HIGH || + sm == APLIC_SOURCECFG_SM_LEVEL_LOW) { + if (!pending) + goto noskip_write_pending; + if ((irqd->state & APLIC_IRQ_STATE_INPUT) && + sm == APLIC_SOURCECFG_SM_LEVEL_LOW) + goto skip_write_pending; + if (!(irqd->state & APLIC_IRQ_STATE_INPUT) && + sm == APLIC_SOURCECFG_SM_LEVEL_HIGH) + goto skip_write_pending; + } + +noskip_write_pending: + if (pending) + irqd->state |= APLIC_IRQ_STATE_PENDING; + else + irqd->state &= ~APLIC_IRQ_STATE_PENDING; + +skip_write_pending: + raw_spin_unlock_irqrestore(&irqd->lock, flags); +} + +static bool aplic_read_enabled(struct aplic *aplic, u32 irq) +{ + bool ret; + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return false; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + ret = (irqd->state & APLIC_IRQ_STATE_ENABLED) ? true : false; + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_write_enabled(struct aplic *aplic, u32 irq, bool enabled) +{ + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + if (enabled) + irqd->state |= APLIC_IRQ_STATE_ENABLED; + else + irqd->state &= ~APLIC_IRQ_STATE_ENABLED; + raw_spin_unlock_irqrestore(&irqd->lock, flags); +} + +static bool aplic_read_input(struct aplic *aplic, u32 irq) +{ + u32 sourcecfg, sm, raw_input, irq_inverted; + struct aplic_irq *irqd; + unsigned long flags; + bool ret = false; + + if (!irq || aplic->nr_irqs <= irq) + return false; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + + sourcecfg = irqd->sourcecfg; + if (sourcecfg & APLIC_SOURCECFG_D) + goto skip; + + sm = sourcecfg & APLIC_SOURCECFG_SM_MASK; + if (sm == APLIC_SOURCECFG_SM_INACTIVE) + goto skip; + + raw_input = (irqd->state & APLIC_IRQ_STATE_INPUT) ? 1 : 0; + irq_inverted = (sm == APLIC_SOURCECFG_SM_LEVEL_LOW || + sm == APLIC_SOURCECFG_SM_EDGE_FALL) ? 1 : 0; + ret = !!(raw_input ^ irq_inverted); + +skip: + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_inject_msi(struct kvm *kvm, u32 irq, u32 target) +{ + u32 hart_idx, guest_idx, eiid; + + hart_idx = target >> APLIC_TARGET_HART_IDX_SHIFT; + hart_idx &= APLIC_TARGET_HART_IDX_MASK; + guest_idx = target >> APLIC_TARGET_GUEST_IDX_SHIFT; + guest_idx &= APLIC_TARGET_GUEST_IDX_MASK; + eiid = target & APLIC_TARGET_EIID_MASK; + kvm_riscv_aia_inject_msi_by_id(kvm, hart_idx, guest_idx, eiid); +} + +static void aplic_update_irq_range(struct kvm *kvm, u32 first, u32 last) +{ + bool inject; + u32 irq, target; + unsigned long flags; + struct aplic_irq *irqd; + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if (!(aplic->domaincfg & APLIC_DOMAINCFG_IE)) + return; + + for (irq = first; irq <= last; irq++) { + if (!irq || aplic->nr_irqs <= irq) + continue; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + + inject = false; + target = irqd->target; + if ((irqd->state & APLIC_IRQ_STATE_ENPEND) == + APLIC_IRQ_STATE_ENPEND) { + irqd->state &= ~APLIC_IRQ_STATE_PENDING; + inject = true; + } + + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + if (inject) + aplic_inject_msi(kvm, irq, target); + } +} + +int kvm_riscv_aia_aplic_inject(struct kvm *kvm, u32 source, bool level) +{ + u32 target; + bool inject = false, ie; + unsigned long flags; + struct aplic_irq *irqd; + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if (!aplic || !source || (aplic->nr_irqs <= source)) + return -ENODEV; + irqd = &aplic->irqs[source]; + ie = (aplic->domaincfg & APLIC_DOMAINCFG_IE) ? true : false; + + raw_spin_lock_irqsave(&irqd->lock, flags); + + if (irqd->sourcecfg & APLIC_SOURCECFG_D) + goto skip_unlock; + + switch (irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK) { + case APLIC_SOURCECFG_SM_EDGE_RISE: + if (level && !(irqd->state & APLIC_IRQ_STATE_INPUT) && + !(irqd->state & APLIC_IRQ_STATE_PENDING)) + irqd->state |= APLIC_IRQ_STATE_PENDING; + break; + case APLIC_SOURCECFG_SM_EDGE_FALL: + if (!level && (irqd->state & APLIC_IRQ_STATE_INPUT) && + !(irqd->state & APLIC_IRQ_STATE_PENDING)) + irqd->state |= APLIC_IRQ_STATE_PENDING; + break; + case APLIC_SOURCECFG_SM_LEVEL_HIGH: + if (level && !(irqd->state & APLIC_IRQ_STATE_PENDING)) + irqd->state |= APLIC_IRQ_STATE_PENDING; + break; + case APLIC_SOURCECFG_SM_LEVEL_LOW: + if (!level && !(irqd->state & APLIC_IRQ_STATE_PENDING)) + irqd->state |= APLIC_IRQ_STATE_PENDING; + break; + } + + if (level) + irqd->state |= APLIC_IRQ_STATE_INPUT; + else + irqd->state &= ~APLIC_IRQ_STATE_INPUT; + + target = irqd->target; + if (ie && ((irqd->state & APLIC_IRQ_STATE_ENPEND) == + APLIC_IRQ_STATE_ENPEND)) { + irqd->state &= ~APLIC_IRQ_STATE_PENDING; + inject = true; + } + +skip_unlock: + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + if (inject) + aplic_inject_msi(kvm, source, target); + + return 0; +} + +static u32 aplic_read_input_word(struct aplic *aplic, u32 word) +{ + u32 i, ret = 0; + + for (i = 0; i < 32; i++) + ret |= aplic_read_input(aplic, word * 32 + i) ? BIT(i) : 0; + + return ret; +} + +static u32 aplic_read_pending_word(struct aplic *aplic, u32 word) +{ + u32 i, ret = 0; + + for (i = 0; i < 32; i++) + ret |= aplic_read_pending(aplic, word * 32 + i) ? BIT(i) : 0; + + return ret; +} + +static void aplic_write_pending_word(struct aplic *aplic, u32 word, + u32 val, bool pending) +{ + u32 i; + + for (i = 0; i < 32; i++) { + if (val & BIT(i)) + aplic_write_pending(aplic, word * 32 + i, pending); + } +} + +static u32 aplic_read_enabled_word(struct aplic *aplic, u32 word) +{ + u32 i, ret = 0; + + for (i = 0; i < 32; i++) + ret |= aplic_read_enabled(aplic, word * 32 + i) ? BIT(i) : 0; + + return ret; +} + +static void aplic_write_enabled_word(struct aplic *aplic, u32 word, + u32 val, bool enabled) +{ + u32 i; + + for (i = 0; i < 32; i++) { + if (val & BIT(i)) + aplic_write_enabled(aplic, word * 32 + i, enabled); + } +} + +static int aplic_mmio_read_offset(struct kvm *kvm, gpa_t off, u32 *val32) +{ + u32 i; + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if ((off & 0x3) != 0) + return -EOPNOTSUPP; + + if (off == APLIC_DOMAINCFG) { + *val32 = APLIC_DOMAINCFG_RDONLY | + aplic->domaincfg | APLIC_DOMAINCFG_DM; + } else if ((off >= APLIC_SOURCECFG_BASE) && + (off < (APLIC_SOURCECFG_BASE + (aplic->nr_irqs - 1) * 4))) { + i = ((off - APLIC_SOURCECFG_BASE) >> 2) + 1; + *val32 = aplic_read_sourcecfg(aplic, i); + } else if ((off >= APLIC_SETIP_BASE) && + (off < (APLIC_SETIP_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_SETIP_BASE) >> 2; + *val32 = aplic_read_pending_word(aplic, i); + } else if (off == APLIC_SETIPNUM) { + *val32 = 0; + } else if ((off >= APLIC_CLRIP_BASE) && + (off < (APLIC_CLRIP_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_CLRIP_BASE) >> 2; + *val32 = aplic_read_input_word(aplic, i); + } else if (off == APLIC_CLRIPNUM) { + *val32 = 0; + } else if ((off >= APLIC_SETIE_BASE) && + (off < (APLIC_SETIE_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_SETIE_BASE) >> 2; + *val32 = aplic_read_enabled_word(aplic, i); + } else if (off == APLIC_SETIENUM) { + *val32 = 0; + } else if ((off >= APLIC_CLRIE_BASE) && + (off < (APLIC_CLRIE_BASE + aplic->nr_words * 4))) { + *val32 = 0; + } else if (off == APLIC_CLRIENUM) { + *val32 = 0; + } else if (off == APLIC_SETIPNUM_LE) { + *val32 = 0; + } else if (off == APLIC_SETIPNUM_BE) { + *val32 = 0; + } else if (off == APLIC_GENMSI) { + *val32 = aplic->genmsi; + } else if ((off >= APLIC_TARGET_BASE) && + (off < (APLIC_TARGET_BASE + (aplic->nr_irqs - 1) * 4))) { + i = ((off - APLIC_TARGET_BASE) >> 2) + 1; + *val32 = aplic_read_target(aplic, i); + } else + return -ENODEV; + + return 0; +} + +static int aplic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + if (len != 4) + return -EOPNOTSUPP; + + return aplic_mmio_read_offset(vcpu->kvm, + addr - vcpu->kvm->arch.aia.aplic_addr, + val); +} + +static int aplic_mmio_write_offset(struct kvm *kvm, gpa_t off, u32 val32) +{ + u32 i; + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if ((off & 0x3) != 0) + return -EOPNOTSUPP; + + if (off == APLIC_DOMAINCFG) { + /* Only IE bit writeable */ + aplic->domaincfg = val32 & APLIC_DOMAINCFG_IE; + } else if ((off >= APLIC_SOURCECFG_BASE) && + (off < (APLIC_SOURCECFG_BASE + (aplic->nr_irqs - 1) * 4))) { + i = ((off - APLIC_SOURCECFG_BASE) >> 2) + 1; + aplic_write_sourcecfg(aplic, i, val32); + } else if ((off >= APLIC_SETIP_BASE) && + (off < (APLIC_SETIP_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_SETIP_BASE) >> 2; + aplic_write_pending_word(aplic, i, val32, true); + } else if (off == APLIC_SETIPNUM) { + aplic_write_pending(aplic, val32, true); + } else if ((off >= APLIC_CLRIP_BASE) && + (off < (APLIC_CLRIP_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_CLRIP_BASE) >> 2; + aplic_write_pending_word(aplic, i, val32, false); + } else if (off == APLIC_CLRIPNUM) { + aplic_write_pending(aplic, val32, false); + } else if ((off >= APLIC_SETIE_BASE) && + (off < (APLIC_SETIE_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_SETIE_BASE) >> 2; + aplic_write_enabled_word(aplic, i, val32, true); + } else if (off == APLIC_SETIENUM) { + aplic_write_enabled(aplic, val32, true); + } else if ((off >= APLIC_CLRIE_BASE) && + (off < (APLIC_CLRIE_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_CLRIE_BASE) >> 2; + aplic_write_enabled_word(aplic, i, val32, false); + } else if (off == APLIC_CLRIENUM) { + aplic_write_enabled(aplic, val32, false); + } else if (off == APLIC_SETIPNUM_LE) { + aplic_write_pending(aplic, val32, true); + } else if (off == APLIC_SETIPNUM_BE) { + aplic_write_pending(aplic, __swab32(val32), true); + } else if (off == APLIC_GENMSI) { + aplic->genmsi = val32 & ~(APLIC_TARGET_GUEST_IDX_MASK << + APLIC_TARGET_GUEST_IDX_SHIFT); + kvm_riscv_aia_inject_msi_by_id(kvm, + val32 >> APLIC_TARGET_HART_IDX_SHIFT, 0, + val32 & APLIC_TARGET_EIID_MASK); + } else if ((off >= APLIC_TARGET_BASE) && + (off < (APLIC_TARGET_BASE + (aplic->nr_irqs - 1) * 4))) { + i = ((off - APLIC_TARGET_BASE) >> 2) + 1; + aplic_write_target(aplic, i, val32); + } else + return -ENODEV; + + aplic_update_irq_range(kvm, 1, aplic->nr_irqs - 1); + + return 0; +} + +static int aplic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + if (len != 4) + return -EOPNOTSUPP; + + return aplic_mmio_write_offset(vcpu->kvm, + addr - vcpu->kvm->arch.aia.aplic_addr, + *((const u32 *)val)); +} + +static struct kvm_io_device_ops aplic_iodoev_ops = { + .read = aplic_mmio_read, + .write = aplic_mmio_write, +}; + +int kvm_riscv_aia_aplic_set_attr(struct kvm *kvm, unsigned long type, u32 v) +{ + int rc; + + if (!kvm->arch.aia.aplic_state) + return -ENODEV; + + rc = aplic_mmio_write_offset(kvm, type, v); + if (rc) + return rc; + + return 0; +} + +int kvm_riscv_aia_aplic_get_attr(struct kvm *kvm, unsigned long type, u32 *v) +{ + int rc; + + if (!kvm->arch.aia.aplic_state) + return -ENODEV; + + rc = aplic_mmio_read_offset(kvm, type, v); + if (rc) + return rc; + + return 0; +} + +int kvm_riscv_aia_aplic_has_attr(struct kvm *kvm, unsigned long type) +{ + int rc; + u32 val; + + if (!kvm->arch.aia.aplic_state) + return -ENODEV; + + rc = aplic_mmio_read_offset(kvm, type, &val); + if (rc) + return rc; + + return 0; +} + +int kvm_riscv_aia_aplic_init(struct kvm *kvm) +{ + int i, ret = 0; + struct aplic *aplic; + + /* Do nothing if we have zero sources */ + if (!kvm->arch.aia.nr_sources) + return 0; + + /* Allocate APLIC global state */ + aplic = kzalloc(sizeof(*aplic), GFP_KERNEL); + if (!aplic) + return -ENOMEM; + kvm->arch.aia.aplic_state = aplic; + + /* Setup APLIC IRQs */ + aplic->nr_irqs = kvm->arch.aia.nr_sources + 1; + aplic->nr_words = DIV_ROUND_UP(aplic->nr_irqs, 32); + aplic->irqs = kcalloc(aplic->nr_irqs, + sizeof(*aplic->irqs), GFP_KERNEL); + if (!aplic->irqs) { + ret = -ENOMEM; + goto fail_free_aplic; + } + for (i = 0; i < aplic->nr_irqs; i++) + raw_spin_lock_init(&aplic->irqs[i].lock); + + /* Setup IO device */ + kvm_iodevice_init(&aplic->iodev, &aplic_iodoev_ops); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, + kvm->arch.aia.aplic_addr, + KVM_DEV_RISCV_APLIC_SIZE, + &aplic->iodev); + mutex_unlock(&kvm->slots_lock); + if (ret) + goto fail_free_aplic_irqs; + + /* Setup default IRQ routing */ + ret = kvm_riscv_setup_default_irq_routing(kvm, aplic->nr_irqs); + if (ret) + goto fail_unreg_iodev; + + return 0; + +fail_unreg_iodev: + mutex_lock(&kvm->slots_lock); + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &aplic->iodev); + mutex_unlock(&kvm->slots_lock); +fail_free_aplic_irqs: + kfree(aplic->irqs); +fail_free_aplic: + kvm->arch.aia.aplic_state = NULL; + kfree(aplic); + return ret; +} + +void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm) +{ + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if (!aplic) + return; + + mutex_lock(&kvm->slots_lock); + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &aplic->iodev); + mutex_unlock(&kvm->slots_lock); + + kfree(aplic->irqs); + + kvm->arch.aia.aplic_state = NULL; + kfree(aplic); +} diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c new file mode 100644 index 000000000000..b195a93add1c --- /dev/null +++ b/arch/riscv/kvm/aia_device.c @@ -0,0 +1,640 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + * + * Authors: + * Anup Patel <apatel@ventanamicro.com> + */ + +#include <linux/bits.h> +#include <linux/irqchip/riscv-imsic.h> +#include <linux/kvm_host.h> +#include <linux/uaccess.h> + +static int aia_create(struct kvm_device *dev, u32 type) +{ + int ret; + unsigned long i; + struct kvm *kvm = dev->kvm; + struct kvm_vcpu *vcpu; + + if (irqchip_in_kernel(kvm)) + return -EEXIST; + + ret = -EBUSY; + if (kvm_trylock_all_vcpus(kvm)) + return ret; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.ran_atleast_once) + goto out_unlock; + } + ret = 0; + + kvm->arch.aia.in_kernel = true; + +out_unlock: + kvm_unlock_all_vcpus(kvm); + return ret; +} + +static void aia_destroy(struct kvm_device *dev) +{ + kfree(dev); +} + +static int aia_config(struct kvm *kvm, unsigned long type, + u32 *nr, bool write) +{ + struct kvm_aia *aia = &kvm->arch.aia; + + /* Writes can only be done before irqchip is initialized */ + if (write && kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + switch (type) { + case KVM_DEV_RISCV_AIA_CONFIG_MODE: + if (write) { + switch (*nr) { + case KVM_DEV_RISCV_AIA_MODE_EMUL: + break; + case KVM_DEV_RISCV_AIA_MODE_HWACCEL: + case KVM_DEV_RISCV_AIA_MODE_AUTO: + /* + * HW Acceleration and Auto modes only + * supported on host with non-zero guest + * external interrupts (i.e. non-zero + * VS-level IMSIC pages). + */ + if (!kvm_riscv_aia_nr_hgei) + return -EINVAL; + break; + default: + return -EINVAL; + } + aia->mode = *nr; + } else + *nr = aia->mode; + break; + case KVM_DEV_RISCV_AIA_CONFIG_IDS: + if (write) { + if ((*nr < KVM_DEV_RISCV_AIA_IDS_MIN) || + (*nr >= KVM_DEV_RISCV_AIA_IDS_MAX) || + ((*nr & KVM_DEV_RISCV_AIA_IDS_MIN) != + KVM_DEV_RISCV_AIA_IDS_MIN) || + (kvm_riscv_aia_max_ids <= *nr)) + return -EINVAL; + aia->nr_ids = *nr; + } else + *nr = aia->nr_ids; + break; + case KVM_DEV_RISCV_AIA_CONFIG_SRCS: + if (write) { + if ((*nr >= KVM_DEV_RISCV_AIA_SRCS_MAX) || + (*nr >= kvm_riscv_aia_max_ids)) + return -EINVAL; + aia->nr_sources = *nr; + } else + *nr = aia->nr_sources; + break; + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS: + if (write) { + if (*nr >= KVM_DEV_RISCV_AIA_GROUP_BITS_MAX) + return -EINVAL; + aia->nr_group_bits = *nr; + } else + *nr = aia->nr_group_bits; + break; + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT: + if (write) { + if ((*nr < KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN) || + (*nr >= KVM_DEV_RISCV_AIA_GROUP_SHIFT_MAX)) + return -EINVAL; + aia->nr_group_shift = *nr; + } else + *nr = aia->nr_group_shift; + break; + case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS: + if (write) { + if (*nr >= KVM_DEV_RISCV_AIA_HART_BITS_MAX) + return -EINVAL; + aia->nr_hart_bits = *nr; + } else + *nr = aia->nr_hart_bits; + break; + case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS: + if (write) { + if (*nr >= KVM_DEV_RISCV_AIA_GUEST_BITS_MAX) + return -EINVAL; + aia->nr_guest_bits = *nr; + } else + *nr = aia->nr_guest_bits; + break; + default: + return -ENXIO; + } + + return 0; +} + +static int aia_aplic_addr(struct kvm *kvm, u64 *addr, bool write) +{ + struct kvm_aia *aia = &kvm->arch.aia; + + if (write) { + /* Writes can only be done before irqchip is initialized */ + if (kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + if (*addr & (KVM_DEV_RISCV_APLIC_ALIGN - 1)) + return -EINVAL; + + aia->aplic_addr = *addr; + } else + *addr = aia->aplic_addr; + + return 0; +} + +static int aia_imsic_addr(struct kvm *kvm, u64 *addr, + unsigned long vcpu_idx, bool write) +{ + struct kvm_vcpu *vcpu; + struct kvm_vcpu_aia *vcpu_aia; + + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (!vcpu) + return -EINVAL; + vcpu_aia = &vcpu->arch.aia_context; + + if (write) { + /* Writes can only be done before irqchip is initialized */ + if (kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + if (*addr & (KVM_DEV_RISCV_IMSIC_ALIGN - 1)) + return -EINVAL; + } + + mutex_lock(&vcpu->mutex); + if (write) + vcpu_aia->imsic_addr = *addr; + else + *addr = vcpu_aia->imsic_addr; + mutex_unlock(&vcpu->mutex); + + return 0; +} + +static gpa_t aia_imsic_ppn(struct kvm_aia *aia, gpa_t addr) +{ + u32 h, l; + gpa_t mask = 0; + + h = aia->nr_hart_bits + aia->nr_guest_bits + + IMSIC_MMIO_PAGE_SHIFT - 1; + mask = GENMASK_ULL(h, 0); + + if (aia->nr_group_bits) { + h = aia->nr_group_bits + aia->nr_group_shift - 1; + l = aia->nr_group_shift; + mask |= GENMASK_ULL(h, l); + } + + return (addr & ~mask) >> IMSIC_MMIO_PAGE_SHIFT; +} + +static u32 aia_imsic_hart_index(struct kvm_aia *aia, gpa_t addr) +{ + u32 hart = 0, group = 0; + + if (aia->nr_hart_bits) + hart = (addr >> (aia->nr_guest_bits + IMSIC_MMIO_PAGE_SHIFT)) & + GENMASK_ULL(aia->nr_hart_bits - 1, 0); + if (aia->nr_group_bits) + group = (addr >> aia->nr_group_shift) & + GENMASK_ULL(aia->nr_group_bits - 1, 0); + + return (group << aia->nr_hart_bits) | hart; +} + +static int aia_init(struct kvm *kvm) +{ + int ret, i; + unsigned long idx; + struct kvm_vcpu *vcpu; + struct kvm_vcpu_aia *vaia; + struct kvm_aia *aia = &kvm->arch.aia; + gpa_t base_ppn = KVM_RISCV_AIA_UNDEF_ADDR; + + /* Irqchip can be initialized only once */ + if (kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* We might be in the middle of creating a VCPU? */ + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) + return -EBUSY; + + /* Number of sources should be less than or equals number of IDs */ + if (aia->nr_ids < aia->nr_sources) + return -EINVAL; + + /* APLIC base is required for non-zero number of sources */ + if (aia->nr_sources && aia->aplic_addr == KVM_RISCV_AIA_UNDEF_ADDR) + return -EINVAL; + + /* Initialize APLIC */ + ret = kvm_riscv_aia_aplic_init(kvm); + if (ret) + return ret; + + /* Iterate over each VCPU */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + vaia = &vcpu->arch.aia_context; + + /* IMSIC base is required */ + if (vaia->imsic_addr == KVM_RISCV_AIA_UNDEF_ADDR) { + ret = -EINVAL; + goto fail_cleanup_imsics; + } + + /* All IMSICs should have matching base PPN */ + if (base_ppn == KVM_RISCV_AIA_UNDEF_ADDR) + base_ppn = aia_imsic_ppn(aia, vaia->imsic_addr); + if (base_ppn != aia_imsic_ppn(aia, vaia->imsic_addr)) { + ret = -EINVAL; + goto fail_cleanup_imsics; + } + + /* Update HART index of the IMSIC based on IMSIC base */ + vaia->hart_index = aia_imsic_hart_index(aia, + vaia->imsic_addr); + + /* Initialize IMSIC for this VCPU */ + ret = kvm_riscv_vcpu_aia_imsic_init(vcpu); + if (ret) + goto fail_cleanup_imsics; + } + + /* Set the initialized flag */ + kvm->arch.aia.initialized = true; + + return 0; + +fail_cleanup_imsics: + for (i = idx - 1; i >= 0; i--) { + vcpu = kvm_get_vcpu(kvm, i); + if (!vcpu) + continue; + kvm_riscv_vcpu_aia_imsic_cleanup(vcpu); + } + kvm_riscv_aia_aplic_cleanup(kvm); + return ret; +} + +static int aia_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + u32 nr; + u64 addr; + int nr_vcpus, r = -ENXIO; + unsigned long v, type = (unsigned long)attr->attr; + void __user *uaddr = (void __user *)(long)attr->addr; + + switch (attr->group) { + case KVM_DEV_RISCV_AIA_GRP_CONFIG: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = aia_config(dev->kvm, type, &nr, true); + mutex_unlock(&dev->kvm->lock); + + break; + + case KVM_DEV_RISCV_AIA_GRP_ADDR: + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + nr_vcpus = atomic_read(&dev->kvm->online_vcpus); + mutex_lock(&dev->kvm->lock); + if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC) + r = aia_aplic_addr(dev->kvm, &addr, true); + else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus)) + r = aia_imsic_addr(dev->kvm, &addr, + type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), true); + mutex_unlock(&dev->kvm->lock); + + break; + + case KVM_DEV_RISCV_AIA_GRP_CTRL: + switch (type) { + case KVM_DEV_RISCV_AIA_CTRL_INIT: + mutex_lock(&dev->kvm->lock); + r = aia_init(dev->kvm); + mutex_unlock(&dev->kvm->lock); + break; + } + + break; + case KVM_DEV_RISCV_AIA_GRP_APLIC: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = kvm_riscv_aia_aplic_set_attr(dev->kvm, type, nr); + mutex_unlock(&dev->kvm->lock); + + break; + case KVM_DEV_RISCV_AIA_GRP_IMSIC: + if (copy_from_user(&v, uaddr, sizeof(v))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = kvm_riscv_aia_imsic_rw_attr(dev->kvm, type, true, &v); + mutex_unlock(&dev->kvm->lock); + + break; + } + + return r; +} + +static int aia_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + u32 nr; + u64 addr; + int nr_vcpus, r = -ENXIO; + void __user *uaddr = (void __user *)(long)attr->addr; + unsigned long v, type = (unsigned long)attr->attr; + + switch (attr->group) { + case KVM_DEV_RISCV_AIA_GRP_CONFIG: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = aia_config(dev->kvm, type, &nr, false); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &nr, sizeof(nr))) + return -EFAULT; + + break; + case KVM_DEV_RISCV_AIA_GRP_ADDR: + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + nr_vcpus = atomic_read(&dev->kvm->online_vcpus); + mutex_lock(&dev->kvm->lock); + if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC) + r = aia_aplic_addr(dev->kvm, &addr, false); + else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus)) + r = aia_imsic_addr(dev->kvm, &addr, + type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), false); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + + break; + case KVM_DEV_RISCV_AIA_GRP_APLIC: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = kvm_riscv_aia_aplic_get_attr(dev->kvm, type, &nr); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &nr, sizeof(nr))) + return -EFAULT; + + break; + case KVM_DEV_RISCV_AIA_GRP_IMSIC: + if (copy_from_user(&v, uaddr, sizeof(v))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = kvm_riscv_aia_imsic_rw_attr(dev->kvm, type, false, &v); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &v, sizeof(v))) + return -EFAULT; + + break; + } + + return r; +} + +static int aia_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int nr_vcpus; + + switch (attr->group) { + case KVM_DEV_RISCV_AIA_GRP_CONFIG: + switch (attr->attr) { + case KVM_DEV_RISCV_AIA_CONFIG_MODE: + case KVM_DEV_RISCV_AIA_CONFIG_IDS: + case KVM_DEV_RISCV_AIA_CONFIG_SRCS: + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS: + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT: + case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS: + case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS: + return 0; + } + break; + case KVM_DEV_RISCV_AIA_GRP_ADDR: + nr_vcpus = atomic_read(&dev->kvm->online_vcpus); + if (attr->attr == KVM_DEV_RISCV_AIA_ADDR_APLIC) + return 0; + else if (attr->attr < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus)) + return 0; + break; + case KVM_DEV_RISCV_AIA_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_RISCV_AIA_CTRL_INIT: + return 0; + } + break; + case KVM_DEV_RISCV_AIA_GRP_APLIC: + return kvm_riscv_aia_aplic_has_attr(dev->kvm, attr->attr); + case KVM_DEV_RISCV_AIA_GRP_IMSIC: + return kvm_riscv_aia_imsic_has_attr(dev->kvm, attr->attr); + } + + return -ENXIO; +} + +struct kvm_device_ops kvm_riscv_aia_device_ops = { + .name = "kvm-riscv-aia", + .create = aia_create, + .destroy = aia_destroy, + .set_attr = aia_set_attr, + .get_attr = aia_get_attr, + .has_attr = aia_has_attr, +}; + +int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return 1; + + /* Update the IMSIC HW state before entering guest mode */ + return kvm_riscv_vcpu_aia_imsic_update(vcpu); +} + +void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + + if (!kvm_riscv_aia_available()) + return; + memset(csr, 0, sizeof(*csr)); + + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return; + + /* Reset the IMSIC context */ + kvm_riscv_vcpu_aia_imsic_reset(vcpu); +} + +void kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context; + + if (!kvm_riscv_aia_available()) + return; + + /* + * We don't do any memory allocations over here because these + * will be done after AIA device is initialized by the user-space. + * + * Refer, aia_init() implementation for more details. + */ + + /* Initialize default values in AIA vcpu context */ + vaia->imsic_addr = KVM_RISCV_AIA_UNDEF_ADDR; + vaia->hart_index = vcpu->vcpu_idx; +} + +void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return; + + /* Cleanup IMSIC context */ + kvm_riscv_vcpu_aia_imsic_cleanup(vcpu); +} + +int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index, + u32 guest_index, u32 iid) +{ + unsigned long idx; + struct kvm_vcpu *vcpu; + + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* Inject MSI to matching VCPU */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + if (vcpu->arch.aia_context.hart_index == hart_index) + return kvm_riscv_vcpu_aia_imsic_inject(vcpu, + guest_index, + 0, iid); + } + + return 0; +} + +int kvm_riscv_aia_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + gpa_t tppn, ippn; + unsigned long idx; + struct kvm_vcpu *vcpu; + u32 g, toff, iid = msi->data; + struct kvm_aia *aia = &kvm->arch.aia; + gpa_t target = (((gpa_t)msi->address_hi) << 32) | msi->address_lo; + + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* Convert target address to target PPN */ + tppn = target >> IMSIC_MMIO_PAGE_SHIFT; + + /* Extract and clear Guest ID from target PPN */ + g = tppn & (BIT(aia->nr_guest_bits) - 1); + tppn &= ~((gpa_t)(BIT(aia->nr_guest_bits) - 1)); + + /* Inject MSI to matching VCPU */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + ippn = vcpu->arch.aia_context.imsic_addr >> + IMSIC_MMIO_PAGE_SHIFT; + if (ippn == tppn) { + toff = target & (IMSIC_MMIO_PAGE_SZ - 1); + return kvm_riscv_vcpu_aia_imsic_inject(vcpu, g, + toff, iid); + } + } + + return 0; +} + +int kvm_riscv_aia_inject_irq(struct kvm *kvm, unsigned int irq, bool level) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* Inject interrupt level change in APLIC */ + return kvm_riscv_aia_aplic_inject(kvm, irq, level); +} + +void kvm_riscv_aia_init_vm(struct kvm *kvm) +{ + struct kvm_aia *aia = &kvm->arch.aia; + + if (!kvm_riscv_aia_available()) + return; + + /* + * We don't do any memory allocations over here because these + * will be done after AIA device is initialized by the user-space. + * + * Refer, aia_init() implementation for more details. + */ + + /* Initialize default values in AIA global context */ + aia->mode = (kvm_riscv_aia_nr_hgei) ? + KVM_DEV_RISCV_AIA_MODE_AUTO : KVM_DEV_RISCV_AIA_MODE_EMUL; + aia->nr_ids = kvm_riscv_aia_max_ids - 1; + aia->nr_sources = 0; + aia->nr_group_bits = 0; + aia->nr_group_shift = KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN; + aia->nr_hart_bits = 0; + aia->nr_guest_bits = 0; + aia->aplic_addr = KVM_RISCV_AIA_UNDEF_ADDR; +} + +void kvm_riscv_aia_destroy_vm(struct kvm *kvm) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return; + + /* Cleanup APLIC context */ + kvm_riscv_aia_aplic_cleanup(kvm); +} diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c new file mode 100644 index 000000000000..e597e86491c3 --- /dev/null +++ b/arch/riscv/kvm/aia_imsic.c @@ -0,0 +1,1153 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + * + * Authors: + * Anup Patel <apatel@ventanamicro.com> + */ + +#include <linux/atomic.h> +#include <linux/bitmap.h> +#include <linux/irqchip/riscv-imsic.h> +#include <linux/kvm_host.h> +#include <linux/math.h> +#include <linux/spinlock.h> +#include <linux/swab.h> +#include <kvm/iodev.h> +#include <asm/csr.h> +#include <asm/kvm_mmu.h> + +#define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64)) + +struct imsic_mrif_eix { + unsigned long eip[BITS_PER_TYPE(u64) / BITS_PER_LONG]; + unsigned long eie[BITS_PER_TYPE(u64) / BITS_PER_LONG]; +}; + +struct imsic_mrif { + struct imsic_mrif_eix eix[IMSIC_MAX_EIX]; + unsigned long eithreshold; + unsigned long eidelivery; +}; + +struct imsic { + struct kvm_io_device iodev; + + u32 nr_msis; + u32 nr_eix; + u32 nr_hw_eix; + + /* + * At any point in time, the register state is in + * one of the following places: + * + * 1) Hardware: IMSIC VS-file (vsfile_cpu >= 0) + * 2) Software: IMSIC SW-file (vsfile_cpu < 0) + */ + + /* IMSIC VS-file */ + rwlock_t vsfile_lock; + int vsfile_cpu; + int vsfile_hgei; + void __iomem *vsfile_va; + phys_addr_t vsfile_pa; + + /* IMSIC SW-file */ + struct imsic_mrif *swfile; + phys_addr_t swfile_pa; + raw_spinlock_t swfile_extirq_lock; +}; + +#define imsic_vs_csr_read(__c) \ +({ \ + unsigned long __r; \ + csr_write(CSR_VSISELECT, __c); \ + __r = csr_read(CSR_VSIREG); \ + __r; \ +}) + +#define imsic_read_switchcase(__ireg) \ + case __ireg: \ + return imsic_vs_csr_read(__ireg); +#define imsic_read_switchcase_2(__ireg) \ + imsic_read_switchcase(__ireg + 0) \ + imsic_read_switchcase(__ireg + 1) +#define imsic_read_switchcase_4(__ireg) \ + imsic_read_switchcase_2(__ireg + 0) \ + imsic_read_switchcase_2(__ireg + 2) +#define imsic_read_switchcase_8(__ireg) \ + imsic_read_switchcase_4(__ireg + 0) \ + imsic_read_switchcase_4(__ireg + 4) +#define imsic_read_switchcase_16(__ireg) \ + imsic_read_switchcase_8(__ireg + 0) \ + imsic_read_switchcase_8(__ireg + 8) +#define imsic_read_switchcase_32(__ireg) \ + imsic_read_switchcase_16(__ireg + 0) \ + imsic_read_switchcase_16(__ireg + 16) +#define imsic_read_switchcase_64(__ireg) \ + imsic_read_switchcase_32(__ireg + 0) \ + imsic_read_switchcase_32(__ireg + 32) + +static unsigned long imsic_eix_read(int ireg) +{ + switch (ireg) { + imsic_read_switchcase_64(IMSIC_EIP0) + imsic_read_switchcase_64(IMSIC_EIE0) + } + + return 0; +} + +#define imsic_vs_csr_swap(__c, __v) \ +({ \ + unsigned long __r; \ + csr_write(CSR_VSISELECT, __c); \ + __r = csr_swap(CSR_VSIREG, __v); \ + __r; \ +}) + +#define imsic_swap_switchcase(__ireg, __v) \ + case __ireg: \ + return imsic_vs_csr_swap(__ireg, __v); +#define imsic_swap_switchcase_2(__ireg, __v) \ + imsic_swap_switchcase(__ireg + 0, __v) \ + imsic_swap_switchcase(__ireg + 1, __v) +#define imsic_swap_switchcase_4(__ireg, __v) \ + imsic_swap_switchcase_2(__ireg + 0, __v) \ + imsic_swap_switchcase_2(__ireg + 2, __v) +#define imsic_swap_switchcase_8(__ireg, __v) \ + imsic_swap_switchcase_4(__ireg + 0, __v) \ + imsic_swap_switchcase_4(__ireg + 4, __v) +#define imsic_swap_switchcase_16(__ireg, __v) \ + imsic_swap_switchcase_8(__ireg + 0, __v) \ + imsic_swap_switchcase_8(__ireg + 8, __v) +#define imsic_swap_switchcase_32(__ireg, __v) \ + imsic_swap_switchcase_16(__ireg + 0, __v) \ + imsic_swap_switchcase_16(__ireg + 16, __v) +#define imsic_swap_switchcase_64(__ireg, __v) \ + imsic_swap_switchcase_32(__ireg + 0, __v) \ + imsic_swap_switchcase_32(__ireg + 32, __v) + +static unsigned long imsic_eix_swap(int ireg, unsigned long val) +{ + switch (ireg) { + imsic_swap_switchcase_64(IMSIC_EIP0, val) + imsic_swap_switchcase_64(IMSIC_EIE0, val) + } + + return 0; +} + +#define imsic_vs_csr_write(__c, __v) \ +do { \ + csr_write(CSR_VSISELECT, __c); \ + csr_write(CSR_VSIREG, __v); \ +} while (0) + +#define imsic_write_switchcase(__ireg, __v) \ + case __ireg: \ + imsic_vs_csr_write(__ireg, __v); \ + break; +#define imsic_write_switchcase_2(__ireg, __v) \ + imsic_write_switchcase(__ireg + 0, __v) \ + imsic_write_switchcase(__ireg + 1, __v) +#define imsic_write_switchcase_4(__ireg, __v) \ + imsic_write_switchcase_2(__ireg + 0, __v) \ + imsic_write_switchcase_2(__ireg + 2, __v) +#define imsic_write_switchcase_8(__ireg, __v) \ + imsic_write_switchcase_4(__ireg + 0, __v) \ + imsic_write_switchcase_4(__ireg + 4, __v) +#define imsic_write_switchcase_16(__ireg, __v) \ + imsic_write_switchcase_8(__ireg + 0, __v) \ + imsic_write_switchcase_8(__ireg + 8, __v) +#define imsic_write_switchcase_32(__ireg, __v) \ + imsic_write_switchcase_16(__ireg + 0, __v) \ + imsic_write_switchcase_16(__ireg + 16, __v) +#define imsic_write_switchcase_64(__ireg, __v) \ + imsic_write_switchcase_32(__ireg + 0, __v) \ + imsic_write_switchcase_32(__ireg + 32, __v) + +static void imsic_eix_write(int ireg, unsigned long val) +{ + switch (ireg) { + imsic_write_switchcase_64(IMSIC_EIP0, val) + imsic_write_switchcase_64(IMSIC_EIE0, val) + } +} + +#define imsic_vs_csr_set(__c, __v) \ +do { \ + csr_write(CSR_VSISELECT, __c); \ + csr_set(CSR_VSIREG, __v); \ +} while (0) + +#define imsic_set_switchcase(__ireg, __v) \ + case __ireg: \ + imsic_vs_csr_set(__ireg, __v); \ + break; +#define imsic_set_switchcase_2(__ireg, __v) \ + imsic_set_switchcase(__ireg + 0, __v) \ + imsic_set_switchcase(__ireg + 1, __v) +#define imsic_set_switchcase_4(__ireg, __v) \ + imsic_set_switchcase_2(__ireg + 0, __v) \ + imsic_set_switchcase_2(__ireg + 2, __v) +#define imsic_set_switchcase_8(__ireg, __v) \ + imsic_set_switchcase_4(__ireg + 0, __v) \ + imsic_set_switchcase_4(__ireg + 4, __v) +#define imsic_set_switchcase_16(__ireg, __v) \ + imsic_set_switchcase_8(__ireg + 0, __v) \ + imsic_set_switchcase_8(__ireg + 8, __v) +#define imsic_set_switchcase_32(__ireg, __v) \ + imsic_set_switchcase_16(__ireg + 0, __v) \ + imsic_set_switchcase_16(__ireg + 16, __v) +#define imsic_set_switchcase_64(__ireg, __v) \ + imsic_set_switchcase_32(__ireg + 0, __v) \ + imsic_set_switchcase_32(__ireg + 32, __v) + +static void imsic_eix_set(int ireg, unsigned long val) +{ + switch (ireg) { + imsic_set_switchcase_64(IMSIC_EIP0, val) + imsic_set_switchcase_64(IMSIC_EIE0, val) + } +} + +static unsigned long imsic_mrif_atomic_rmw(struct imsic_mrif *mrif, + unsigned long *ptr, + unsigned long new_val, + unsigned long wr_mask) +{ + unsigned long old_val = 0, tmp = 0; + + __asm__ __volatile__ ( + "0: lr.w.aq %1, %0\n" + " and %2, %1, %3\n" + " or %2, %2, %4\n" + " sc.w.rl %2, %2, %0\n" + " bnez %2, 0b" + : "+A" (*ptr), "+r" (old_val), "+r" (tmp) + : "r" (~wr_mask), "r" (new_val & wr_mask) + : "memory"); + + return old_val; +} + +static unsigned long imsic_mrif_atomic_or(struct imsic_mrif *mrif, + unsigned long *ptr, + unsigned long val) +{ + return atomic_long_fetch_or(val, (atomic_long_t *)ptr); +} + +#define imsic_mrif_atomic_write(__mrif, __ptr, __new_val) \ + imsic_mrif_atomic_rmw(__mrif, __ptr, __new_val, -1UL) +#define imsic_mrif_atomic_read(__mrif, __ptr) \ + imsic_mrif_atomic_or(__mrif, __ptr, 0) + +static u32 imsic_mrif_topei(struct imsic_mrif *mrif, u32 nr_eix, u32 nr_msis) +{ + struct imsic_mrif_eix *eix; + u32 i, imin, imax, ei, max_msi; + unsigned long eipend[BITS_PER_TYPE(u64) / BITS_PER_LONG]; + unsigned long eithreshold = imsic_mrif_atomic_read(mrif, + &mrif->eithreshold); + + max_msi = (eithreshold && (eithreshold <= nr_msis)) ? + eithreshold : nr_msis; + for (ei = 0; ei < nr_eix; ei++) { + eix = &mrif->eix[ei]; + eipend[0] = imsic_mrif_atomic_read(mrif, &eix->eie[0]) & + imsic_mrif_atomic_read(mrif, &eix->eip[0]); +#ifdef CONFIG_32BIT + eipend[1] = imsic_mrif_atomic_read(mrif, &eix->eie[1]) & + imsic_mrif_atomic_read(mrif, &eix->eip[1]); + if (!eipend[0] && !eipend[1]) +#else + if (!eipend[0]) +#endif + continue; + + imin = ei * BITS_PER_TYPE(u64); + imax = ((imin + BITS_PER_TYPE(u64)) < max_msi) ? + imin + BITS_PER_TYPE(u64) : max_msi; + for (i = (!imin) ? 1 : imin; i < imax; i++) { + if (test_bit(i - imin, eipend)) + return (i << TOPEI_ID_SHIFT) | i; + } + } + + return 0; +} + +static int imsic_mrif_isel_check(u32 nr_eix, unsigned long isel) +{ + u32 num = 0; + + switch (isel) { + case IMSIC_EIDELIVERY: + case IMSIC_EITHRESHOLD: + break; + case IMSIC_EIP0 ... IMSIC_EIP63: + num = isel - IMSIC_EIP0; + break; + case IMSIC_EIE0 ... IMSIC_EIE63: + num = isel - IMSIC_EIE0; + break; + default: + return -ENOENT; + } +#ifndef CONFIG_32BIT + if (num & 0x1) + return -EINVAL; +#endif + if ((num / 2) >= nr_eix) + return -EINVAL; + + return 0; +} + +static int imsic_mrif_rmw(struct imsic_mrif *mrif, u32 nr_eix, + unsigned long isel, unsigned long *val, + unsigned long new_val, unsigned long wr_mask) +{ + bool pend; + struct imsic_mrif_eix *eix; + unsigned long *ei, num, old_val = 0; + + switch (isel) { + case IMSIC_EIDELIVERY: + old_val = imsic_mrif_atomic_rmw(mrif, &mrif->eidelivery, + new_val, wr_mask & 0x1); + break; + case IMSIC_EITHRESHOLD: + old_val = imsic_mrif_atomic_rmw(mrif, &mrif->eithreshold, + new_val, wr_mask & (IMSIC_MAX_ID - 1)); + break; + case IMSIC_EIP0 ... IMSIC_EIP63: + case IMSIC_EIE0 ... IMSIC_EIE63: + if (isel >= IMSIC_EIP0 && isel <= IMSIC_EIP63) { + pend = true; + num = isel - IMSIC_EIP0; + } else { + pend = false; + num = isel - IMSIC_EIE0; + } + + if ((num / 2) >= nr_eix) + return -EINVAL; + eix = &mrif->eix[num / 2]; + +#ifndef CONFIG_32BIT + if (num & 0x1) + return -EINVAL; + ei = (pend) ? &eix->eip[0] : &eix->eie[0]; +#else + ei = (pend) ? &eix->eip[num & 0x1] : &eix->eie[num & 0x1]; +#endif + + /* Bit0 of EIP0 or EIE0 is read-only */ + if (!num) + wr_mask &= ~BIT(0); + + old_val = imsic_mrif_atomic_rmw(mrif, ei, new_val, wr_mask); + break; + default: + return -ENOENT; + } + + if (val) + *val = old_val; + + return 0; +} + +struct imsic_vsfile_read_data { + int hgei; + u32 nr_eix; + bool clear; + struct imsic_mrif *mrif; +}; + +static void imsic_vsfile_local_read(void *data) +{ + u32 i; + struct imsic_mrif_eix *eix; + struct imsic_vsfile_read_data *idata = data; + struct imsic_mrif *mrif = idata->mrif; + unsigned long new_hstatus, old_hstatus, old_vsiselect; + + old_vsiselect = csr_read(CSR_VSISELECT); + old_hstatus = csr_read(CSR_HSTATUS); + new_hstatus = old_hstatus & ~HSTATUS_VGEIN; + new_hstatus |= ((unsigned long)idata->hgei) << HSTATUS_VGEIN_SHIFT; + csr_write(CSR_HSTATUS, new_hstatus); + + /* + * We don't use imsic_mrif_atomic_xyz() functions to store + * values in MRIF because imsic_vsfile_read() is always called + * with pointer to temporary MRIF on stack. + */ + + if (idata->clear) { + mrif->eidelivery = imsic_vs_csr_swap(IMSIC_EIDELIVERY, 0); + mrif->eithreshold = imsic_vs_csr_swap(IMSIC_EITHRESHOLD, 0); + for (i = 0; i < idata->nr_eix; i++) { + eix = &mrif->eix[i]; + eix->eip[0] = imsic_eix_swap(IMSIC_EIP0 + i * 2, 0); + eix->eie[0] = imsic_eix_swap(IMSIC_EIE0 + i * 2, 0); +#ifdef CONFIG_32BIT + eix->eip[1] = imsic_eix_swap(IMSIC_EIP0 + i * 2 + 1, 0); + eix->eie[1] = imsic_eix_swap(IMSIC_EIE0 + i * 2 + 1, 0); +#endif + } + } else { + mrif->eidelivery = imsic_vs_csr_read(IMSIC_EIDELIVERY); + mrif->eithreshold = imsic_vs_csr_read(IMSIC_EITHRESHOLD); + for (i = 0; i < idata->nr_eix; i++) { + eix = &mrif->eix[i]; + eix->eip[0] = imsic_eix_read(IMSIC_EIP0 + i * 2); + eix->eie[0] = imsic_eix_read(IMSIC_EIE0 + i * 2); +#ifdef CONFIG_32BIT + eix->eip[1] = imsic_eix_read(IMSIC_EIP0 + i * 2 + 1); + eix->eie[1] = imsic_eix_read(IMSIC_EIE0 + i * 2 + 1); +#endif + } + } + + csr_write(CSR_HSTATUS, old_hstatus); + csr_write(CSR_VSISELECT, old_vsiselect); +} + +static void imsic_vsfile_read(int vsfile_hgei, int vsfile_cpu, u32 nr_eix, + bool clear, struct imsic_mrif *mrif) +{ + struct imsic_vsfile_read_data idata; + + /* We can only read clear if we have a IMSIC VS-file */ + if (vsfile_cpu < 0 || vsfile_hgei <= 0) + return; + + /* We can only read clear on local CPU */ + idata.hgei = vsfile_hgei; + idata.nr_eix = nr_eix; + idata.clear = clear; + idata.mrif = mrif; + on_each_cpu_mask(cpumask_of(vsfile_cpu), + imsic_vsfile_local_read, &idata, 1); +} + +struct imsic_vsfile_rw_data { + int hgei; + int isel; + bool write; + unsigned long val; +}; + +static void imsic_vsfile_local_rw(void *data) +{ + struct imsic_vsfile_rw_data *idata = data; + unsigned long new_hstatus, old_hstatus, old_vsiselect; + + old_vsiselect = csr_read(CSR_VSISELECT); + old_hstatus = csr_read(CSR_HSTATUS); + new_hstatus = old_hstatus & ~HSTATUS_VGEIN; + new_hstatus |= ((unsigned long)idata->hgei) << HSTATUS_VGEIN_SHIFT; + csr_write(CSR_HSTATUS, new_hstatus); + + switch (idata->isel) { + case IMSIC_EIDELIVERY: + if (idata->write) + imsic_vs_csr_write(IMSIC_EIDELIVERY, idata->val); + else + idata->val = imsic_vs_csr_read(IMSIC_EIDELIVERY); + break; + case IMSIC_EITHRESHOLD: + if (idata->write) + imsic_vs_csr_write(IMSIC_EITHRESHOLD, idata->val); + else + idata->val = imsic_vs_csr_read(IMSIC_EITHRESHOLD); + break; + case IMSIC_EIP0 ... IMSIC_EIP63: + case IMSIC_EIE0 ... IMSIC_EIE63: +#ifndef CONFIG_32BIT + if (idata->isel & 0x1) + break; +#endif + if (idata->write) + imsic_eix_write(idata->isel, idata->val); + else + idata->val = imsic_eix_read(idata->isel); + break; + default: + break; + } + + csr_write(CSR_HSTATUS, old_hstatus); + csr_write(CSR_VSISELECT, old_vsiselect); +} + +static int imsic_vsfile_rw(int vsfile_hgei, int vsfile_cpu, u32 nr_eix, + unsigned long isel, bool write, + unsigned long *val) +{ + int rc; + struct imsic_vsfile_rw_data rdata; + + /* We can only access register if we have a IMSIC VS-file */ + if (vsfile_cpu < 0 || vsfile_hgei <= 0) + return -EINVAL; + + /* Check IMSIC register iselect */ + rc = imsic_mrif_isel_check(nr_eix, isel); + if (rc) + return rc; + + /* We can only access register on local CPU */ + rdata.hgei = vsfile_hgei; + rdata.isel = isel; + rdata.write = write; + rdata.val = (write) ? *val : 0; + on_each_cpu_mask(cpumask_of(vsfile_cpu), + imsic_vsfile_local_rw, &rdata, 1); + + if (!write) + *val = rdata.val; + + return 0; +} + +static void imsic_vsfile_local_clear(int vsfile_hgei, u32 nr_eix) +{ + u32 i; + unsigned long new_hstatus, old_hstatus, old_vsiselect; + + /* We can only zero-out if we have a IMSIC VS-file */ + if (vsfile_hgei <= 0) + return; + + old_vsiselect = csr_read(CSR_VSISELECT); + old_hstatus = csr_read(CSR_HSTATUS); + new_hstatus = old_hstatus & ~HSTATUS_VGEIN; + new_hstatus |= ((unsigned long)vsfile_hgei) << HSTATUS_VGEIN_SHIFT; + csr_write(CSR_HSTATUS, new_hstatus); + + imsic_vs_csr_write(IMSIC_EIDELIVERY, 0); + imsic_vs_csr_write(IMSIC_EITHRESHOLD, 0); + for (i = 0; i < nr_eix; i++) { + imsic_eix_write(IMSIC_EIP0 + i * 2, 0); + imsic_eix_write(IMSIC_EIE0 + i * 2, 0); +#ifdef CONFIG_32BIT + imsic_eix_write(IMSIC_EIP0 + i * 2 + 1, 0); + imsic_eix_write(IMSIC_EIE0 + i * 2 + 1, 0); +#endif + } + + csr_write(CSR_HSTATUS, old_hstatus); + csr_write(CSR_VSISELECT, old_vsiselect); +} + +static void imsic_vsfile_local_update(int vsfile_hgei, u32 nr_eix, + struct imsic_mrif *mrif) +{ + u32 i; + struct imsic_mrif_eix *eix; + unsigned long new_hstatus, old_hstatus, old_vsiselect; + + /* We can only update if we have a HW IMSIC context */ + if (vsfile_hgei <= 0) + return; + + /* + * We don't use imsic_mrif_atomic_xyz() functions to read values + * from MRIF in this function because it is always called with + * pointer to temporary MRIF on stack. + */ + + old_vsiselect = csr_read(CSR_VSISELECT); + old_hstatus = csr_read(CSR_HSTATUS); + new_hstatus = old_hstatus & ~HSTATUS_VGEIN; + new_hstatus |= ((unsigned long)vsfile_hgei) << HSTATUS_VGEIN_SHIFT; + csr_write(CSR_HSTATUS, new_hstatus); + + for (i = 0; i < nr_eix; i++) { + eix = &mrif->eix[i]; + imsic_eix_set(IMSIC_EIP0 + i * 2, eix->eip[0]); + imsic_eix_set(IMSIC_EIE0 + i * 2, eix->eie[0]); +#ifdef CONFIG_32BIT + imsic_eix_set(IMSIC_EIP0 + i * 2 + 1, eix->eip[1]); + imsic_eix_set(IMSIC_EIE0 + i * 2 + 1, eix->eie[1]); +#endif + } + imsic_vs_csr_write(IMSIC_EITHRESHOLD, mrif->eithreshold); + imsic_vs_csr_write(IMSIC_EIDELIVERY, mrif->eidelivery); + + csr_write(CSR_HSTATUS, old_hstatus); + csr_write(CSR_VSISELECT, old_vsiselect); +} + +static void imsic_vsfile_cleanup(struct imsic *imsic) +{ + int old_vsfile_hgei, old_vsfile_cpu; + unsigned long flags; + + /* + * We don't use imsic_mrif_atomic_xyz() functions to clear the + * SW-file in this function because it is always called when the + * VCPU is being destroyed. + */ + + write_lock_irqsave(&imsic->vsfile_lock, flags); + old_vsfile_hgei = imsic->vsfile_hgei; + old_vsfile_cpu = imsic->vsfile_cpu; + imsic->vsfile_cpu = imsic->vsfile_hgei = -1; + imsic->vsfile_va = NULL; + imsic->vsfile_pa = 0; + write_unlock_irqrestore(&imsic->vsfile_lock, flags); + + memset(imsic->swfile, 0, sizeof(*imsic->swfile)); + + if (old_vsfile_cpu >= 0) + kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei); +} + +static void imsic_swfile_extirq_update(struct kvm_vcpu *vcpu) +{ + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + struct imsic_mrif *mrif = imsic->swfile; + unsigned long flags; + + /* + * The critical section is necessary during external interrupt + * updates to avoid the risk of losing interrupts due to potential + * interruptions between reading topei and updating pending status. + */ + + raw_spin_lock_irqsave(&imsic->swfile_extirq_lock, flags); + + if (imsic_mrif_atomic_read(mrif, &mrif->eidelivery) && + imsic_mrif_topei(mrif, imsic->nr_eix, imsic->nr_msis)) + kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_VS_EXT); + else + kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT); + + raw_spin_unlock_irqrestore(&imsic->swfile_extirq_lock, flags); +} + +static void imsic_swfile_read(struct kvm_vcpu *vcpu, bool clear, + struct imsic_mrif *mrif) +{ + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + /* + * We don't use imsic_mrif_atomic_xyz() functions to read and + * write SW-file and MRIF in this function because it is always + * called when VCPU is not using SW-file and the MRIF points to + * a temporary MRIF on stack. + */ + + memcpy(mrif, imsic->swfile, sizeof(*mrif)); + if (clear) { + memset(imsic->swfile, 0, sizeof(*imsic->swfile)); + kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT); + } +} + +static void imsic_swfile_update(struct kvm_vcpu *vcpu, + struct imsic_mrif *mrif) +{ + u32 i; + struct imsic_mrif_eix *seix, *eix; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + struct imsic_mrif *smrif = imsic->swfile; + + imsic_mrif_atomic_write(smrif, &smrif->eidelivery, mrif->eidelivery); + imsic_mrif_atomic_write(smrif, &smrif->eithreshold, mrif->eithreshold); + for (i = 0; i < imsic->nr_eix; i++) { + seix = &smrif->eix[i]; + eix = &mrif->eix[i]; + imsic_mrif_atomic_or(smrif, &seix->eip[0], eix->eip[0]); + imsic_mrif_atomic_or(smrif, &seix->eie[0], eix->eie[0]); +#ifdef CONFIG_32BIT + imsic_mrif_atomic_or(smrif, &seix->eip[1], eix->eip[1]); + imsic_mrif_atomic_or(smrif, &seix->eie[1], eix->eie[1]); +#endif + } + + imsic_swfile_extirq_update(vcpu); +} + +bool kvm_riscv_vcpu_aia_imsic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + unsigned long flags; + bool ret = false; + + /* + * The IMSIC SW-file directly injects interrupt via hvip so + * only check for interrupt when IMSIC VS-file is being used. + */ + + read_lock_irqsave(&imsic->vsfile_lock, flags); + if (imsic->vsfile_cpu > -1) { + /* + * This function is typically called from kvm_vcpu_block() via + * kvm_arch_vcpu_runnable() upon WFI trap. The kvm_vcpu_block() + * can be preempted and the blocking VCPU might resume on a + * different CPU. This means it is possible that current CPU + * does not match the imsic->vsfile_cpu hence this function + * must check imsic->vsfile_cpu before accessing HGEIP CSR. + */ + if (imsic->vsfile_cpu != vcpu->cpu) + ret = true; + else + ret = !!(csr_read(CSR_HGEIP) & BIT(imsic->vsfile_hgei)); + } + read_unlock_irqrestore(&imsic->vsfile_lock, flags); + + return ret; +} + +void kvm_riscv_vcpu_aia_imsic_load(struct kvm_vcpu *vcpu, int cpu) +{ + /* + * No need to explicitly clear HGEIE CSR bits because the + * hgei interrupt handler (aka hgei_interrupt()) will always + * clear it for us. + */ +} + +void kvm_riscv_vcpu_aia_imsic_put(struct kvm_vcpu *vcpu) +{ + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + unsigned long flags; + + if (!kvm_vcpu_is_blocking(vcpu)) + return; + + read_lock_irqsave(&imsic->vsfile_lock, flags); + if (imsic->vsfile_cpu > -1) + csr_set(CSR_HGEIE, BIT(imsic->vsfile_hgei)); + read_unlock_irqrestore(&imsic->vsfile_lock, flags); +} + +void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + struct imsic_mrif tmrif; + int old_vsfile_hgei, old_vsfile_cpu; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + /* Read and clear IMSIC VS-file details */ + write_lock_irqsave(&imsic->vsfile_lock, flags); + old_vsfile_hgei = imsic->vsfile_hgei; + old_vsfile_cpu = imsic->vsfile_cpu; + imsic->vsfile_cpu = imsic->vsfile_hgei = -1; + imsic->vsfile_va = NULL; + imsic->vsfile_pa = 0; + write_unlock_irqrestore(&imsic->vsfile_lock, flags); + + /* Do nothing, if no IMSIC VS-file to release */ + if (old_vsfile_cpu < 0) + return; + + /* + * At this point, all interrupt producers are still using + * the old IMSIC VS-file so we first re-direct all interrupt + * producers. + */ + + /* Purge the G-stage mapping */ + kvm_riscv_mmu_iounmap(vcpu->kvm, vcpu->arch.aia_context.imsic_addr, + IMSIC_MMIO_PAGE_SZ); + + /* TODO: Purge the IOMMU mapping ??? */ + + /* + * At this point, all interrupt producers have been re-directed + * to somewhere else so we move register state from the old IMSIC + * VS-file to the IMSIC SW-file. + */ + + /* Read and clear register state from old IMSIC VS-file */ + memset(&tmrif, 0, sizeof(tmrif)); + imsic_vsfile_read(old_vsfile_hgei, old_vsfile_cpu, imsic->nr_hw_eix, + true, &tmrif); + + /* Update register state in IMSIC SW-file */ + imsic_swfile_update(vcpu, &tmrif); + + /* Free-up old IMSIC VS-file */ + kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei); +} + +int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + phys_addr_t new_vsfile_pa; + struct imsic_mrif tmrif; + void __iomem *new_vsfile_va; + struct kvm *kvm = vcpu->kvm; + struct kvm_run *run = vcpu->run; + struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context; + struct imsic *imsic = vaia->imsic_state; + int ret = 0, new_vsfile_hgei = -1, old_vsfile_hgei, old_vsfile_cpu; + + /* Do nothing for emulation mode */ + if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_EMUL) + return 1; + + /* Read old IMSIC VS-file details */ + read_lock_irqsave(&imsic->vsfile_lock, flags); + old_vsfile_hgei = imsic->vsfile_hgei; + old_vsfile_cpu = imsic->vsfile_cpu; + read_unlock_irqrestore(&imsic->vsfile_lock, flags); + + /* Do nothing if we are continuing on same CPU */ + if (old_vsfile_cpu == vcpu->cpu) + return 1; + + /* Allocate new IMSIC VS-file */ + ret = kvm_riscv_aia_alloc_hgei(vcpu->cpu, vcpu, + &new_vsfile_va, &new_vsfile_pa); + if (ret <= 0) { + /* For HW acceleration mode, we can't continue */ + if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_HWACCEL) { + run->fail_entry.hardware_entry_failure_reason = + KVM_EXIT_FAIL_ENTRY_NO_VSFILE; + run->fail_entry.cpu = vcpu->cpu; + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return 0; + } + + /* Release old IMSIC VS-file */ + if (old_vsfile_cpu >= 0) + kvm_riscv_vcpu_aia_imsic_release(vcpu); + + /* For automatic mode, we continue */ + goto done; + } + new_vsfile_hgei = ret; + + /* + * At this point, all interrupt producers are still using + * to the old IMSIC VS-file so we first move all interrupt + * producers to the new IMSIC VS-file. + */ + + /* Ensure HGEIE CSR bit is zero before using the new IMSIC VS-file */ + csr_clear(CSR_HGEIE, BIT(new_vsfile_hgei)); + + /* Zero-out new IMSIC VS-file */ + imsic_vsfile_local_clear(new_vsfile_hgei, imsic->nr_hw_eix); + + /* Update G-stage mapping for the new IMSIC VS-file */ + ret = kvm_riscv_mmu_ioremap(kvm, vcpu->arch.aia_context.imsic_addr, + new_vsfile_pa, IMSIC_MMIO_PAGE_SZ, + true, true); + if (ret) + goto fail_free_vsfile_hgei; + + /* TODO: Update the IOMMU mapping ??? */ + + /* Update new IMSIC VS-file details in IMSIC context */ + write_lock_irqsave(&imsic->vsfile_lock, flags); + imsic->vsfile_hgei = new_vsfile_hgei; + imsic->vsfile_cpu = vcpu->cpu; + imsic->vsfile_va = new_vsfile_va; + imsic->vsfile_pa = new_vsfile_pa; + write_unlock_irqrestore(&imsic->vsfile_lock, flags); + + /* + * At this point, all interrupt producers have been moved + * to the new IMSIC VS-file so we move register state from + * the old IMSIC VS/SW-file to the new IMSIC VS-file. + */ + + memset(&tmrif, 0, sizeof(tmrif)); + if (old_vsfile_cpu >= 0) { + /* Read and clear register state from old IMSIC VS-file */ + imsic_vsfile_read(old_vsfile_hgei, old_vsfile_cpu, + imsic->nr_hw_eix, true, &tmrif); + + /* Free-up old IMSIC VS-file */ + kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei); + } else { + /* Read and clear register state from IMSIC SW-file */ + imsic_swfile_read(vcpu, true, &tmrif); + } + + /* Restore register state in the new IMSIC VS-file */ + imsic_vsfile_local_update(new_vsfile_hgei, imsic->nr_hw_eix, &tmrif); + +done: + /* Set VCPU HSTATUS.VGEIN to new IMSIC VS-file */ + vcpu->arch.guest_context.hstatus &= ~HSTATUS_VGEIN; + if (new_vsfile_hgei > 0) + vcpu->arch.guest_context.hstatus |= + ((unsigned long)new_vsfile_hgei) << HSTATUS_VGEIN_SHIFT; + + /* Continue run-loop */ + return 1; + +fail_free_vsfile_hgei: + kvm_riscv_aia_free_hgei(vcpu->cpu, new_vsfile_hgei); + return ret; +} + +int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + u32 topei; + struct imsic_mrif_eix *eix; + int r, rc = KVM_INSN_CONTINUE_NEXT_SEPC; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + if (isel == KVM_RISCV_AIA_IMSIC_TOPEI) { + /* Read pending and enabled interrupt with highest priority */ + topei = imsic_mrif_topei(imsic->swfile, imsic->nr_eix, + imsic->nr_msis); + if (val) + *val = topei; + + /* Writes ignore value and clear top pending interrupt */ + if (topei && wr_mask) { + topei >>= TOPEI_ID_SHIFT; + if (topei) { + eix = &imsic->swfile->eix[topei / + BITS_PER_TYPE(u64)]; + clear_bit(topei & (BITS_PER_TYPE(u64) - 1), + eix->eip); + } + } + } else { + r = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix, isel, + val, new_val, wr_mask); + /* Forward unknown IMSIC register to user-space */ + if (r) + rc = (r == -ENOENT) ? 0 : KVM_INSN_ILLEGAL_TRAP; + } + + if (wr_mask) + imsic_swfile_extirq_update(vcpu); + + return rc; +} + +int kvm_riscv_aia_imsic_rw_attr(struct kvm *kvm, unsigned long type, + bool write, unsigned long *val) +{ + u32 isel, vcpu_id; + unsigned long flags; + struct imsic *imsic; + struct kvm_vcpu *vcpu; + int rc, vsfile_hgei, vsfile_cpu; + + if (!kvm_riscv_aia_initialized(kvm)) + return -ENODEV; + + vcpu_id = KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(type); + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + if (!vcpu) + return -ENODEV; + + isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type); + imsic = vcpu->arch.aia_context.imsic_state; + + read_lock_irqsave(&imsic->vsfile_lock, flags); + + rc = 0; + vsfile_hgei = imsic->vsfile_hgei; + vsfile_cpu = imsic->vsfile_cpu; + if (vsfile_cpu < 0) { + if (write) { + rc = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix, + isel, NULL, *val, -1UL); + imsic_swfile_extirq_update(vcpu); + } else + rc = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix, + isel, val, 0, 0); + } + + read_unlock_irqrestore(&imsic->vsfile_lock, flags); + + if (!rc && vsfile_cpu >= 0) + rc = imsic_vsfile_rw(vsfile_hgei, vsfile_cpu, imsic->nr_eix, + isel, write, val); + + return rc; +} + +int kvm_riscv_aia_imsic_has_attr(struct kvm *kvm, unsigned long type) +{ + u32 isel, vcpu_id; + struct imsic *imsic; + struct kvm_vcpu *vcpu; + + if (!kvm_riscv_aia_initialized(kvm)) + return -ENODEV; + + vcpu_id = KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(type); + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + if (!vcpu) + return -ENODEV; + + isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type); + imsic = vcpu->arch.aia_context.imsic_state; + return imsic_mrif_isel_check(imsic->nr_eix, isel); +} + +void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu) +{ + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + if (!imsic) + return; + + kvm_riscv_vcpu_aia_imsic_release(vcpu); + + memset(imsic->swfile, 0, sizeof(*imsic->swfile)); +} + +int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, + u32 guest_index, u32 offset, u32 iid) +{ + unsigned long flags; + struct imsic_mrif_eix *eix; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + /* We only emulate one IMSIC MMIO page for each Guest VCPU */ + if (!imsic || !iid || guest_index || + (offset != IMSIC_MMIO_SETIPNUM_LE && + offset != IMSIC_MMIO_SETIPNUM_BE)) + return -ENODEV; + + iid = (offset == IMSIC_MMIO_SETIPNUM_BE) ? __swab32(iid) : iid; + if (imsic->nr_msis <= iid) + return -EINVAL; + + read_lock_irqsave(&imsic->vsfile_lock, flags); + + if (imsic->vsfile_cpu >= 0) { + writel(iid, imsic->vsfile_va + IMSIC_MMIO_SETIPNUM_LE); + } else { + eix = &imsic->swfile->eix[iid / BITS_PER_TYPE(u64)]; + set_bit(iid & (BITS_PER_TYPE(u64) - 1), eix->eip); + imsic_swfile_extirq_update(vcpu); + } + + read_unlock_irqrestore(&imsic->vsfile_lock, flags); + + return 0; +} + +static int imsic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + if (len != 4 || (addr & 0x3) != 0) + return -EOPNOTSUPP; + + *((u32 *)val) = 0; + + return 0; +} + +static int imsic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + struct kvm_msi msi = { 0 }; + + if (len != 4 || (addr & 0x3) != 0) + return -EOPNOTSUPP; + + msi.address_hi = addr >> 32; + msi.address_lo = (u32)addr; + msi.data = *((const u32 *)val); + kvm_riscv_aia_inject_msi(vcpu->kvm, &msi); + + return 0; +}; + +static struct kvm_io_device_ops imsic_iodoev_ops = { + .read = imsic_mmio_read, + .write = imsic_mmio_write, +}; + +int kvm_riscv_vcpu_aia_imsic_init(struct kvm_vcpu *vcpu) +{ + int ret = 0; + struct imsic *imsic; + struct page *swfile_page; + struct kvm *kvm = vcpu->kvm; + + /* Fail if we have zero IDs */ + if (!kvm->arch.aia.nr_ids) + return -EINVAL; + + /* Allocate IMSIC context */ + imsic = kzalloc(sizeof(*imsic), GFP_KERNEL); + if (!imsic) + return -ENOMEM; + vcpu->arch.aia_context.imsic_state = imsic; + + /* Setup IMSIC context */ + imsic->nr_msis = kvm->arch.aia.nr_ids + 1; + rwlock_init(&imsic->vsfile_lock); + imsic->nr_eix = BITS_TO_U64(imsic->nr_msis); + imsic->nr_hw_eix = BITS_TO_U64(kvm_riscv_aia_max_ids); + imsic->vsfile_hgei = imsic->vsfile_cpu = -1; + + /* Setup IMSIC SW-file */ + swfile_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + get_order(sizeof(*imsic->swfile))); + if (!swfile_page) { + ret = -ENOMEM; + goto fail_free_imsic; + } + imsic->swfile = page_to_virt(swfile_page); + imsic->swfile_pa = page_to_phys(swfile_page); + raw_spin_lock_init(&imsic->swfile_extirq_lock); + + /* Setup IO device */ + kvm_iodevice_init(&imsic->iodev, &imsic_iodoev_ops); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, + vcpu->arch.aia_context.imsic_addr, + KVM_DEV_RISCV_IMSIC_SIZE, + &imsic->iodev); + mutex_unlock(&kvm->slots_lock); + if (ret) + goto fail_free_swfile; + + return 0; + +fail_free_swfile: + free_pages((unsigned long)imsic->swfile, + get_order(sizeof(*imsic->swfile))); +fail_free_imsic: + vcpu->arch.aia_context.imsic_state = NULL; + kfree(imsic); + return ret; +} + +void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + if (!imsic) + return; + + imsic_vsfile_cleanup(imsic); + + mutex_lock(&kvm->slots_lock); + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &imsic->iodev); + mutex_unlock(&kvm->slots_lock); + + free_pages((unsigned long)imsic->swfile, + get_order(sizeof(*imsic->swfile))); + + vcpu->arch.aia_context.imsic_state = NULL; + kfree(imsic); +} diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c new file mode 100644 index 000000000000..b67d60d722c2 --- /dev/null +++ b/arch/riscv/kvm/gstage.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * Copyright (c) 2025 Ventana Micro Systems Inc. + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/pgtable.h> +#include <asm/kvm_gstage.h> + +#ifdef CONFIG_64BIT +unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4; +unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3; +#else +unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4; +unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2; +#endif + +#define gstage_pte_leaf(__ptep) \ + (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) + +static inline unsigned long gstage_pte_index(gpa_t addr, u32 level) +{ + unsigned long mask; + unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level); + + if (level == (kvm_riscv_gstage_pgd_levels - 1)) + mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1; + else + mask = PTRS_PER_PTE - 1; + + return (addr >> shift) & mask; +} + +static inline unsigned long gstage_pte_page_vaddr(pte_t pte) +{ + return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte))); +} + +static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) +{ + u32 i; + unsigned long psz = 1UL << 12; + + for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) { + if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) { + *out_level = i; + return 0; + } + } + + return -EINVAL; +} + +static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder) +{ + if (kvm_riscv_gstage_pgd_levels < level) + return -EINVAL; + + *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits); + return 0; +} + +static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) +{ + int rc; + unsigned long page_order = PAGE_SHIFT; + + rc = gstage_level_to_page_order(level, &page_order); + if (rc) + return rc; + + *out_pgsize = BIT(page_order); + return 0; +} + +bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, + pte_t **ptepp, u32 *ptep_level) +{ + pte_t *ptep; + u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + + *ptep_level = current_level; + ptep = (pte_t *)gstage->pgd; + ptep = &ptep[gstage_pte_index(addr, current_level)]; + while (ptep && pte_val(ptep_get(ptep))) { + if (gstage_pte_leaf(ptep)) { + *ptep_level = current_level; + *ptepp = ptep; + return true; + } + + if (current_level) { + current_level--; + *ptep_level = current_level; + ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); + ptep = &ptep[gstage_pte_index(addr, current_level)]; + } else { + ptep = NULL; + } + } + + return false; +} + +static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr) +{ + unsigned long order = PAGE_SHIFT; + + if (gstage_level_to_page_order(level, &order)) + return; + addr &= ~(BIT(order) - 1); + + if (gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) + kvm_riscv_local_hfence_gvma_vmid_gpa(gstage->vmid, addr, BIT(order), order); + else + kvm_riscv_hfence_gvma_vmid_gpa(gstage->kvm, -1UL, 0, addr, BIT(order), order, + gstage->vmid); +} + +int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + const struct kvm_gstage_mapping *map) +{ + u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + pte_t *next_ptep = (pte_t *)gstage->pgd; + pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; + + if (current_level < map->level) + return -EINVAL; + + while (current_level != map->level) { + if (gstage_pte_leaf(ptep)) + return -EEXIST; + + if (!pte_val(ptep_get(ptep))) { + if (!pcache) + return -ENOMEM; + next_ptep = kvm_mmu_memory_cache_alloc(pcache); + if (!next_ptep) + return -ENOMEM; + set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)), + __pgprot(_PAGE_TABLE))); + } else { + if (gstage_pte_leaf(ptep)) + return -EEXIST; + next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); + } + + current_level--; + ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; + } + + if (pte_val(*ptep) != pte_val(map->pte)) { + set_pte(ptep, map->pte); + if (gstage_pte_leaf(ptep)) + gstage_tlb_flush(gstage, current_level, map->addr); + } + + return 0; +} + +int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + gpa_t gpa, phys_addr_t hpa, unsigned long page_size, + bool page_rdonly, bool page_exec, + struct kvm_gstage_mapping *out_map) +{ + pgprot_t prot; + int ret; + + out_map->addr = gpa; + out_map->level = 0; + + ret = gstage_page_size_to_level(page_size, &out_map->level); + if (ret) + return ret; + + /* + * A RISC-V implementation can choose to either: + * 1) Update 'A' and 'D' PTE bits in hardware + * 2) Generate page fault when 'A' and/or 'D' bits are not set + * PTE so that software can update these bits. + * + * We support both options mentioned above. To achieve this, we + * always set 'A' and 'D' PTE bits at time of creating G-stage + * mapping. To support KVM dirty page logging with both options + * mentioned above, we will write-protect G-stage PTEs to track + * dirty pages. + */ + + if (page_exec) { + if (page_rdonly) + prot = PAGE_READ_EXEC; + else + prot = PAGE_WRITE_EXEC; + } else { + if (page_rdonly) + prot = PAGE_READ; + else + prot = PAGE_WRITE; + } + out_map->pte = pfn_pte(PFN_DOWN(hpa), prot); + out_map->pte = pte_mkdirty(out_map->pte); + + return kvm_riscv_gstage_set_pte(gstage, pcache, out_map); +} + +void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr, + pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op) +{ + int i, ret; + pte_t old_pte, *next_ptep; + u32 next_ptep_level; + unsigned long next_page_size, page_size; + + ret = gstage_level_to_page_size(ptep_level, &page_size); + if (ret) + return; + + WARN_ON(addr & (page_size - 1)); + + if (!pte_val(ptep_get(ptep))) + return; + + if (ptep_level && !gstage_pte_leaf(ptep)) { + next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); + next_ptep_level = ptep_level - 1; + ret = gstage_level_to_page_size(next_ptep_level, &next_page_size); + if (ret) + return; + + if (op == GSTAGE_OP_CLEAR) + set_pte(ptep, __pte(0)); + for (i = 0; i < PTRS_PER_PTE; i++) + kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size, + &next_ptep[i], next_ptep_level, op); + if (op == GSTAGE_OP_CLEAR) + put_page(virt_to_page(next_ptep)); + } else { + old_pte = *ptep; + if (op == GSTAGE_OP_CLEAR) + set_pte(ptep, __pte(0)); + else if (op == GSTAGE_OP_WP) + set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE)); + if (pte_val(*ptep) != pte_val(old_pte)) + gstage_tlb_flush(gstage, ptep_level, addr); + } +} + +void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage, + gpa_t start, gpa_t size, bool may_block) +{ + int ret; + pte_t *ptep; + u32 ptep_level; + bool found_leaf; + unsigned long page_size; + gpa_t addr = start, end = start + size; + + while (addr < end) { + found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level); + ret = gstage_level_to_page_size(ptep_level, &page_size); + if (ret) + break; + + if (!found_leaf) + goto next; + + if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) + kvm_riscv_gstage_op_pte(gstage, addr, ptep, + ptep_level, GSTAGE_OP_CLEAR); + +next: + addr += page_size; + + /* + * If the range is too large, release the kvm->mmu_lock + * to prevent starvation and lockup detector warnings. + */ + if (!(gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) && may_block && addr < end) + cond_resched_lock(&gstage->kvm->mmu_lock); + } +} + +void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end) +{ + int ret; + pte_t *ptep; + u32 ptep_level; + bool found_leaf; + gpa_t addr = start; + unsigned long page_size; + + while (addr < end) { + found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level); + ret = gstage_level_to_page_size(ptep_level, &page_size); + if (ret) + break; + + if (!found_leaf) + goto next; + + if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) + kvm_riscv_gstage_op_pte(gstage, addr, ptep, + ptep_level, GSTAGE_OP_WP); + +next: + addr += page_size; + } +} + +void __init kvm_riscv_gstage_mode_detect(void) +{ +#ifdef CONFIG_64BIT + /* Try Sv57x4 G-stage mode */ + csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); + if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) { + kvm_riscv_gstage_mode = HGATP_MODE_SV57X4; + kvm_riscv_gstage_pgd_levels = 5; + goto done; + } + + /* Try Sv48x4 G-stage mode */ + csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); + if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { + kvm_riscv_gstage_mode = HGATP_MODE_SV48X4; + kvm_riscv_gstage_pgd_levels = 4; + goto done; + } + + /* Try Sv39x4 G-stage mode */ + csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); + if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) { + kvm_riscv_gstage_mode = HGATP_MODE_SV39X4; + kvm_riscv_gstage_pgd_levels = 3; + goto done; + } +#else /* CONFIG_32BIT */ + /* Try Sv32x4 G-stage mode */ + csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); + if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) { + kvm_riscv_gstage_mode = HGATP_MODE_SV32X4; + kvm_riscv_gstage_pgd_levels = 2; + goto done; + } +#endif + + /* KVM depends on !HGATP_MODE_OFF */ + kvm_riscv_gstage_mode = HGATP_MODE_OFF; + kvm_riscv_gstage_pgd_levels = 0; + +done: + csr_write(CSR_HGATP, 0); + kvm_riscv_local_hfence_gvma_all(); +} diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 58c5489d3031..45536af521f0 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -10,54 +10,54 @@ #include <linux/err.h> #include <linux/module.h> #include <linux/kvm_host.h> -#include <asm/csr.h> -#include <asm/hwcap.h> +#include <asm/cpufeature.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nacl.h> #include <asm/sbi.h> -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - return -EINVAL; -} +DEFINE_STATIC_KEY_FALSE(kvm_riscv_vsstage_tlb_no_gpa); -int kvm_arch_check_processor_compat(void *opaque) +static void kvm_riscv_setup_vendor_features(void) { - return 0; + /* Andes AX66: split two-stage TLBs */ + if (riscv_cached_mvendorid(0) == ANDES_VENDOR_ID && + (riscv_cached_marchid(0) & 0xFFFF) == 0x8A66) { + static_branch_enable(&kvm_riscv_vsstage_tlb_no_gpa); + kvm_info("VS-stage TLB does not cache guest physical address and VMID\n"); + } } -int kvm_arch_hardware_setup(void *opaque) +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) { - return 0; + return -EINVAL; } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { - unsigned long hideleg, hedeleg; + int rc; - hedeleg = 0; - hedeleg |= (1UL << EXC_INST_MISALIGNED); - hedeleg |= (1UL << EXC_BREAKPOINT); - hedeleg |= (1UL << EXC_SYSCALL); - hedeleg |= (1UL << EXC_INST_PAGE_FAULT); - hedeleg |= (1UL << EXC_LOAD_PAGE_FAULT); - hedeleg |= (1UL << EXC_STORE_PAGE_FAULT); - csr_write(CSR_HEDELEG, hedeleg); + rc = kvm_riscv_nacl_enable(); + if (rc) + return rc; - hideleg = 0; - hideleg |= (1UL << IRQ_VS_SOFT); - hideleg |= (1UL << IRQ_VS_TIMER); - hideleg |= (1UL << IRQ_VS_EXT); - csr_write(CSR_HIDELEG, hideleg); + csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT); + csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT); - csr_write(CSR_HCOUNTEREN, -1UL); + /* VS should access only the time counter directly. Everything else should trap */ + csr_write(CSR_HCOUNTEREN, 0x02); csr_write(CSR_HVIP, 0); + kvm_riscv_aia_enable(); + return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { + kvm_riscv_aia_disable(); + /* * After clearing the hideleg CSR, the host kernel will receive * spurious interrupts if hvip CSR has pending interrupts and the @@ -68,10 +68,21 @@ void kvm_arch_hardware_disable(void) csr_write(CSR_HVIP, 0); csr_write(CSR_HEDELEG, 0); csr_write(CSR_HIDELEG, 0); + + kvm_riscv_nacl_disable(); } -int kvm_arch_init(void *opaque) +static void kvm_riscv_teardown(void) { + kvm_riscv_aia_exit(); + kvm_riscv_nacl_exit(); + kvm_unregister_perf_callbacks(); +} + +static int __init riscv_kvm_init(void) +{ + int rc; + char slist[64]; const char *str; if (!riscv_isa_extension_available(NULL, h)) { @@ -84,18 +95,17 @@ int kvm_arch_init(void *opaque) return -ENODEV; } - if (sbi_probe_extension(SBI_EXT_RFENCE) <= 0) { + if (!sbi_probe_extension(SBI_EXT_RFENCE)) { kvm_info("require SBI RFENCE extension\n"); return -ENODEV; } - kvm_riscv_gstage_mode_detect(); - - kvm_riscv_gstage_vmid_detect(); - - kvm_info("hypervisor extension available\n"); + rc = kvm_riscv_nacl_init(); + if (rc && rc != -ENODEV) + return rc; - switch (kvm_riscv_gstage_mode()) { + kvm_riscv_gstage_mode_detect(); + switch (kvm_riscv_gstage_mode) { case HGATP_MODE_SV32X4: str = "Sv32x4"; break; @@ -109,27 +119,77 @@ int kvm_arch_init(void *opaque) str = "Sv57x4"; break; default: + kvm_riscv_nacl_exit(); return -ENODEV; } + + kvm_riscv_gstage_vmid_detect(); + + rc = kvm_riscv_aia_init(); + if (rc && rc != -ENODEV) { + kvm_riscv_nacl_exit(); + return rc; + } + + kvm_info("hypervisor extension available\n"); + + if (kvm_riscv_nacl_available()) { + rc = 0; + slist[0] = '\0'; + if (kvm_riscv_nacl_sync_csr_available()) { + if (rc) + strcat(slist, ", "); + strcat(slist, "sync_csr"); + rc++; + } + if (kvm_riscv_nacl_sync_hfence_available()) { + if (rc) + strcat(slist, ", "); + strcat(slist, "sync_hfence"); + rc++; + } + if (kvm_riscv_nacl_sync_sret_available()) { + if (rc) + strcat(slist, ", "); + strcat(slist, "sync_sret"); + rc++; + } + if (kvm_riscv_nacl_autoswap_csr_available()) { + if (rc) + strcat(slist, ", "); + strcat(slist, "autoswap_csr"); + rc++; + } + kvm_info("using SBI nested acceleration with %s\n", + (rc) ? slist : "no features"); + } + kvm_info("using %s G-stage page table format\n", str); kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); - return 0; -} + if (kvm_riscv_aia_available()) + kvm_info("AIA available with %d guest external interrupts\n", + kvm_riscv_aia_nr_hgei); -void kvm_arch_exit(void) -{ -} + kvm_riscv_setup_vendor_features(); -static int __init riscv_kvm_init(void) -{ - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + kvm_register_perf_callbacks(NULL); + + rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (rc) { + kvm_riscv_teardown(); + return rc; + } + + return 0; } module_init(riscv_kvm_init); static void __exit riscv_kvm_exit(void) { kvm_exit(); + + kvm_riscv_teardown(); } module_exit(riscv_kvm_exit); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 34b57e0be2ef..4ab06697bfc0 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -6,371 +6,73 @@ * Anup Patel <anup.patel@wdc.com> */ -#include <linux/bitops.h> #include <linux/errno.h> -#include <linux/err.h> #include <linux/hugetlb.h> #include <linux/module.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/kvm_host.h> #include <linux/sched/signal.h> -#include <asm/csr.h> -#include <asm/page.h> -#include <asm/pgtable.h> - -#ifdef CONFIG_64BIT -static unsigned long gstage_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); -static unsigned long gstage_pgd_levels = 3; -#define gstage_index_bits 9 -#else -static unsigned long gstage_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); -static unsigned long gstage_pgd_levels = 2; -#define gstage_index_bits 10 -#endif - -#define gstage_pgd_xbits 2 -#define gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits)) -#define gstage_gpa_bits (HGATP_PAGE_SHIFT + \ - (gstage_pgd_levels * gstage_index_bits) + \ - gstage_pgd_xbits) -#define gstage_gpa_size ((gpa_t)(1ULL << gstage_gpa_bits)) - -#define gstage_pte_leaf(__ptep) \ - (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) - -static inline unsigned long gstage_pte_index(gpa_t addr, u32 level) -{ - unsigned long mask; - unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level); - - if (level == (gstage_pgd_levels - 1)) - mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1; - else - mask = PTRS_PER_PTE - 1; - - return (addr >> shift) & mask; -} - -static inline unsigned long gstage_pte_page_vaddr(pte_t pte) -{ - return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte))); -} - -static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) -{ - u32 i; - unsigned long psz = 1UL << 12; - - for (i = 0; i < gstage_pgd_levels; i++) { - if (page_size == (psz << (i * gstage_index_bits))) { - *out_level = i; - return 0; - } - } - - return -EINVAL; -} - -static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder) -{ - if (gstage_pgd_levels < level) - return -EINVAL; - - *out_pgorder = 12 + (level * gstage_index_bits); - return 0; -} - -static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) -{ - int rc; - unsigned long page_order = PAGE_SHIFT; - - rc = gstage_level_to_page_order(level, &page_order); - if (rc) - return rc; - - *out_pgsize = BIT(page_order); - return 0; -} - -static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr, - pte_t **ptepp, u32 *ptep_level) -{ - pte_t *ptep; - u32 current_level = gstage_pgd_levels - 1; - - *ptep_level = current_level; - ptep = (pte_t *)kvm->arch.pgd; - ptep = &ptep[gstage_pte_index(addr, current_level)]; - while (ptep && pte_val(*ptep)) { - if (gstage_pte_leaf(ptep)) { - *ptep_level = current_level; - *ptepp = ptep; - return true; - } - - if (current_level) { - current_level--; - *ptep_level = current_level; - ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); - ptep = &ptep[gstage_pte_index(addr, current_level)]; - } else { - ptep = NULL; - } - } - - return false; -} - -static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) -{ - unsigned long order = PAGE_SHIFT; - - if (gstage_level_to_page_order(level, &order)) - return; - addr &= ~(BIT(order) - 1); - - kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order); -} - -static int gstage_set_pte(struct kvm *kvm, u32 level, - struct kvm_mmu_memory_cache *pcache, - gpa_t addr, const pte_t *new_pte) -{ - u32 current_level = gstage_pgd_levels - 1; - pte_t *next_ptep = (pte_t *)kvm->arch.pgd; - pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)]; - - if (current_level < level) - return -EINVAL; - - while (current_level != level) { - if (gstage_pte_leaf(ptep)) - return -EEXIST; - - if (!pte_val(*ptep)) { - if (!pcache) - return -ENOMEM; - next_ptep = kvm_mmu_memory_cache_alloc(pcache); - if (!next_ptep) - return -ENOMEM; - *ptep = pfn_pte(PFN_DOWN(__pa(next_ptep)), - __pgprot(_PAGE_TABLE)); - } else { - if (gstage_pte_leaf(ptep)) - return -EEXIST; - next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); - } - - current_level--; - ptep = &next_ptep[gstage_pte_index(addr, current_level)]; - } - - *ptep = *new_pte; - if (gstage_pte_leaf(ptep)) - gstage_remote_tlb_flush(kvm, current_level, addr); - - return 0; -} - -static int gstage_map_page(struct kvm *kvm, - struct kvm_mmu_memory_cache *pcache, - gpa_t gpa, phys_addr_t hpa, - unsigned long page_size, - bool page_rdonly, bool page_exec) -{ - int ret; - u32 level = 0; - pte_t new_pte; - pgprot_t prot; - - ret = gstage_page_size_to_level(page_size, &level); - if (ret) - return ret; - - /* - * A RISC-V implementation can choose to either: - * 1) Update 'A' and 'D' PTE bits in hardware - * 2) Generate page fault when 'A' and/or 'D' bits are not set - * PTE so that software can update these bits. - * - * We support both options mentioned above. To achieve this, we - * always set 'A' and 'D' PTE bits at time of creating G-stage - * mapping. To support KVM dirty page logging with both options - * mentioned above, we will write-protect G-stage PTEs to track - * dirty pages. - */ - - if (page_exec) { - if (page_rdonly) - prot = PAGE_READ_EXEC; - else - prot = PAGE_WRITE_EXEC; - } else { - if (page_rdonly) - prot = PAGE_READ; - else - prot = PAGE_WRITE; - } - new_pte = pfn_pte(PFN_DOWN(hpa), prot); - new_pte = pte_mkdirty(new_pte); - - return gstage_set_pte(kvm, level, pcache, gpa, &new_pte); -} - -enum gstage_op { - GSTAGE_OP_NOP = 0, /* Nothing */ - GSTAGE_OP_CLEAR, /* Clear/Unmap */ - GSTAGE_OP_WP, /* Write-protect */ -}; - -static void gstage_op_pte(struct kvm *kvm, gpa_t addr, - pte_t *ptep, u32 ptep_level, enum gstage_op op) -{ - int i, ret; - pte_t *next_ptep; - u32 next_ptep_level; - unsigned long next_page_size, page_size; - - ret = gstage_level_to_page_size(ptep_level, &page_size); - if (ret) - return; - - BUG_ON(addr & (page_size - 1)); - - if (!pte_val(*ptep)) - return; - - if (ptep_level && !gstage_pte_leaf(ptep)) { - next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); - next_ptep_level = ptep_level - 1; - ret = gstage_level_to_page_size(next_ptep_level, - &next_page_size); - if (ret) - return; - - if (op == GSTAGE_OP_CLEAR) - set_pte(ptep, __pte(0)); - for (i = 0; i < PTRS_PER_PTE; i++) - gstage_op_pte(kvm, addr + i * next_page_size, - &next_ptep[i], next_ptep_level, op); - if (op == GSTAGE_OP_CLEAR) - put_page(virt_to_page(next_ptep)); - } else { - if (op == GSTAGE_OP_CLEAR) - set_pte(ptep, __pte(0)); - else if (op == GSTAGE_OP_WP) - set_pte(ptep, __pte(pte_val(*ptep) & ~_PAGE_WRITE)); - gstage_remote_tlb_flush(kvm, ptep_level, addr); - } -} - -static void gstage_unmap_range(struct kvm *kvm, gpa_t start, - gpa_t size, bool may_block) -{ - int ret; - pte_t *ptep; - u32 ptep_level; - bool found_leaf; - unsigned long page_size; - gpa_t addr = start, end = start + size; - - while (addr < end) { - found_leaf = gstage_get_leaf_entry(kvm, addr, - &ptep, &ptep_level); - ret = gstage_level_to_page_size(ptep_level, &page_size); - if (ret) - break; - - if (!found_leaf) - goto next; - - if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - gstage_op_pte(kvm, addr, ptep, - ptep_level, GSTAGE_OP_CLEAR); - -next: - addr += page_size; - - /* - * If the range is too large, release the kvm->mmu_lock - * to prevent starvation and lockup detector warnings. - */ - if (may_block && addr < end) - cond_resched_lock(&kvm->mmu_lock); - } -} +#include <asm/kvm_mmu.h> +#include <asm/kvm_nacl.h> -static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) -{ - int ret; - pte_t *ptep; - u32 ptep_level; - bool found_leaf; - gpa_t addr = start; - unsigned long page_size; - - while (addr < end) { - found_leaf = gstage_get_leaf_entry(kvm, addr, - &ptep, &ptep_level); - ret = gstage_level_to_page_size(ptep_level, &page_size); - if (ret) - break; - - if (!found_leaf) - goto next; - - if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - gstage_op_pte(kvm, addr, ptep, - ptep_level, GSTAGE_OP_WP); - -next: - addr += page_size; - } -} - -static void gstage_wp_memory_region(struct kvm *kvm, int slot) +static void mmu_wp_memory_region(struct kvm *kvm, int slot) { struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; spin_lock(&kvm->mmu_lock); - gstage_wp_range(kvm, start, end); + kvm_riscv_gstage_wp_range(&gstage, start, end); spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_memslot(kvm, memslot); } -int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, - phys_addr_t hpa, unsigned long size, - bool writable, bool in_atomic) +int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, + unsigned long size, bool writable, bool in_atomic) { - pte_t pte; int ret = 0; + pgprot_t prot; unsigned long pfn; phys_addr_t addr, end; struct kvm_mmu_memory_cache pcache = { .gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0, .gfp_zero = __GFP_ZERO, }; + struct kvm_gstage_mapping map; + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK; pfn = __phys_to_pfn(hpa); + prot = pgprot_noncached(PAGE_WRITE); for (addr = gpa; addr < end; addr += PAGE_SIZE) { - pte = pfn_pte(pfn, PAGE_KERNEL_IO); + map.addr = addr; + map.pte = pfn_pte(pfn, prot); + map.pte = pte_mkdirty(map.pte); + map.level = 0; if (!writable) - pte = pte_wrprotect(pte); + map.pte = pte_wrprotect(map.pte); - ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels); + ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels); if (ret) goto out; spin_lock(&kvm->mmu_lock); - ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte); + ret = kvm_riscv_gstage_set_pte(&gstage, &pcache, &map); spin_unlock(&kvm->mmu_lock); if (ret) goto out; @@ -383,10 +85,17 @@ out: return ret; } -void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size) +void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size) { + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + spin_lock(&kvm->mmu_lock); - gstage_unmap_range(kvm, gpa, size, false); + kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false); spin_unlock(&kvm->mmu_lock); } @@ -398,18 +107,18 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, phys_addr_t base_gfn = slot->base_gfn + gfn_offset; phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + struct kvm_gstage gstage; - gstage_wp_range(kvm, start, end); -} + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; -void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) -{ + kvm_riscv_gstage_wp_range(&gstage, start, end); } -void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - const struct kvm_memory_slot *memslot) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - kvm_flush_remote_tlbs(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free) @@ -422,7 +131,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_riscv_gstage_free_pgd(kvm); + kvm_riscv_mmu_free_pgd(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -430,9 +139,15 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, { gpa_t gpa = slot->base_gfn << PAGE_SHIFT; phys_addr_t size = slot->npages << PAGE_SHIFT; + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; spin_lock(&kvm->mmu_lock); - gstage_unmap_range(kvm, gpa, size, false); + kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false); spin_unlock(&kvm->mmu_lock); } @@ -446,8 +161,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * allocated dirty_bitmap[], dirty pages will be tracked while * the memory slot is write protected. */ - if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) - gstage_wp_memory_region(kvm, new->id); + if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + return; + mmu_wp_memory_region(kvm, new->id); + } } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -456,7 +174,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change) { hva_t hva, reg_end, size; - gpa_t base_gpa; bool writable; int ret = 0; @@ -469,21 +186,19 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * space addressable by the KVM guest GPA space. */ if ((new->base_gfn + new->npages) >= - (gstage_gpa_size >> PAGE_SHIFT)) + (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT)) return -EFAULT; hva = new->userspace_addr; size = new->npages << PAGE_SHIFT; reg_end = hva + size; - base_gpa = new->base_gfn << PAGE_SHIFT; writable = !(new->flags & KVM_MEM_READONLY); mmap_read_lock(current->mm); /* * A memory region could potentially cover multiple VMAs, and - * any holes between them, so iterate over all of them to find - * out if we can map any of them right now. + * any holes between them, so iterate over all of them. * * +--------------------------------------------+ * +---------------+----------------+ +----------------+ @@ -493,10 +208,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * +--------------------------------------------+ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); - hva_t vm_start, vm_end; + struct vm_area_struct *vma; + hva_t vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) break; /* @@ -509,37 +225,18 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, } /* Take the intersection of this VMA with the memory region */ - vm_start = max(hva, vma->vm_start); vm_end = min(reg_end, vma->vm_end); if (vma->vm_flags & VM_PFNMAP) { - gpa_t gpa = base_gpa + (vm_start - hva); - phys_addr_t pa; - - pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; - pa += vm_start - vma->vm_start; - /* IO region dirty page logging not allowed */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { ret = -EINVAL; goto out; } - - ret = kvm_riscv_gstage_ioremap(kvm, gpa, pa, - vm_end - vm_start, - writable, false); - if (ret) - break; } hva = vm_end; } while (hva < reg_end); - if (change == KVM_MR_FLAGS_ONLY) - goto out; - - if (ret) - kvm_riscv_gstage_iounmap(kvm, base_gpa, size); - out: mmap_read_unlock(current->mm); return ret; @@ -547,32 +244,18 @@ out: bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - if (!kvm->arch.pgd) - return false; - - gstage_unmap_range(kvm, range->start << PAGE_SHIFT, - (range->end - range->start) << PAGE_SHIFT, - range->may_block); - return false; -} - -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - int ret; - kvm_pfn_t pfn = pte_pfn(range->pte); + struct kvm_gstage gstage; if (!kvm->arch.pgd) return false; - WARN_ON(range->end - range->start != 1); - - ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT, - __pfn_to_phys(pfn), PAGE_SIZE, true, true); - if (ret) { - kvm_debug("Failed to map G-stage page (error %d)\n", ret); - return true; - } - + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT, + (range->end - range->start) << PAGE_SHIFT, + range->may_block); return false; } @@ -581,14 +264,19 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) pte_t *ptep; u32 ptep_level = 0; u64 size = (range->end - range->start) << PAGE_SHIFT; + struct kvm_gstage gstage; if (!kvm->arch.pgd) return false; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, - &ptep, &ptep_level)) + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) return false; return ptep_test_and_clear_young(NULL, 0, ptep); @@ -599,22 +287,27 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) pte_t *ptep; u32 ptep_level = 0; u64 size = (range->end - range->start) << PAGE_SHIFT; + struct kvm_gstage gstage; if (!kvm->arch.pgd) return false; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, - &ptep, &ptep_level)) + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) return false; - return pte_young(*ptep); + return pte_young(ptep_get(ptep)); } -int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, - struct kvm_memory_slot *memslot, - gpa_t gpa, unsigned long hva, bool is_write) +int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, + gpa_t gpa, unsigned long hva, bool is_write, + struct kvm_gstage_mapping *out_map) { int ret; kvm_pfn_t hfn; @@ -627,6 +320,23 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, bool logging = (memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY)) ? true : false; unsigned long vma_pagesize, mmu_seq; + struct kvm_gstage gstage; + struct page *page; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + + /* Setup initial state of output mapping */ + memset(out_map, 0, sizeof(*out_map)); + + /* We need minimum second+third level pages */ + ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels); + if (ret) { + kvm_err("Failed to topup G-stage cache\n"); + return ret; + } mmap_read_lock(current->mm); @@ -645,28 +355,29 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, if (logging || (vma->vm_flags & VM_PFNMAP)) vma_pagesize = PAGE_SIZE; - if (vma_pagesize == PMD_SIZE || vma_pagesize == PGDIR_SIZE) + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; + /* + * Read mmu_invalidate_seq so that KVM can detect if the results of + * vma_lookup() or __kvm_faultin_pfn() become stale prior to acquiring + * kvm->mmu_lock. + * + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). + */ + mmu_seq = kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - if (vma_pagesize != PGDIR_SIZE && + if (vma_pagesize != PUD_SIZE && vma_pagesize != PMD_SIZE && vma_pagesize != PAGE_SIZE) { kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize); return -EFAULT; } - /* We need minimum second+third level pages */ - ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels); - if (ret) { - kvm_err("Failed to topup G-stage cache\n"); - return ret; - } - - mmu_seq = kvm->mmu_invalidate_seq; - - hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable); + hfn = __kvm_faultin_pfn(memslot, gfn, is_write ? FOLL_WRITE : 0, + &writable, &page); if (hfn == KVM_PFN_ERR_HWPOISON) { send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, vma_pageshift, current); @@ -688,26 +399,24 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, goto out_unlock; if (writable) { - kvm_set_pfn_dirty(hfn); - mark_page_dirty(kvm, gfn); - ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, - vma_pagesize, false, true); + mark_page_dirty_in_slot(kvm, memslot, gfn); + ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT, + vma_pagesize, false, true, out_map); } else { - ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, - vma_pagesize, true, true); + ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT, + vma_pagesize, true, true, out_map); } if (ret) kvm_err("Failed to map in G-stage\n"); out_unlock: + kvm_release_faultin_page(kvm, page, ret && ret != -EEXIST, writable); spin_unlock(&kvm->mmu_lock); - kvm_set_pfn_accessed(hfn); - kvm_release_pfn_clean(hfn); return ret; } -int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm) +int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm) { struct page *pgd_page; @@ -717,7 +426,7 @@ int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm) } pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, - get_order(gstage_pgd_size)); + get_order(kvm_riscv_gstage_pgd_size)); if (!pgd_page) return -ENOMEM; kvm->arch.pgd = page_to_virt(pgd_page); @@ -726,13 +435,18 @@ int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm) return 0; } -void kvm_riscv_gstage_free_pgd(struct kvm *kvm) +void kvm_riscv_mmu_free_pgd(struct kvm *kvm) { + struct kvm_gstage gstage; void *pgd = NULL; spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { - gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false); + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; kvm->arch.pgd_phys = 0; @@ -740,54 +454,19 @@ void kvm_riscv_gstage_free_pgd(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); if (pgd) - free_pages((unsigned long)pgd, get_order(gstage_pgd_size)); + free_pages((unsigned long)pgd, get_order(kvm_riscv_gstage_pgd_size)); } -void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu) +void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu) { - unsigned long hgatp = gstage_mode; + unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT; struct kvm_arch *k = &vcpu->kvm->arch; - hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & - HGATP_VMID_MASK; + hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID; hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN; - csr_write(CSR_HGATP, hgatp); + ncsr_write(CSR_HGATP, hgatp); if (!kvm_riscv_gstage_vmid_bits()) kvm_riscv_local_hfence_gvma_all(); } - -void kvm_riscv_gstage_mode_detect(void) -{ -#ifdef CONFIG_64BIT - /* Try Sv57x4 G-stage mode */ - csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); - if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) { - gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); - gstage_pgd_levels = 5; - goto skip_sv48x4_test; - } - - /* Try Sv48x4 G-stage mode */ - csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); - if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { - gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); - gstage_pgd_levels = 4; - } -skip_sv48x4_test: - - csr_write(CSR_HGATP, 0); - kvm_riscv_local_hfence_gvma_all(); -#endif -} - -unsigned long kvm_riscv_gstage_mode(void) -{ - return gstage_mode >> HGATP_MODE_SHIFT; -} - -int kvm_riscv_gstage_gpa_bits(void) -{ - return gstage_gpa_bits; -} diff --git a/arch/riscv/kvm/nacl.c b/arch/riscv/kvm/nacl.c new file mode 100644 index 000000000000..08a95ad9ada2 --- /dev/null +++ b/arch/riscv/kvm/nacl.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024 Ventana Micro Systems Inc. + */ + +#include <linux/kvm_host.h> +#include <linux/vmalloc.h> +#include <asm/kvm_nacl.h> + +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_available); +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_sync_csr_available); +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_sync_hfence_available); +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_sync_sret_available); +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_autoswap_csr_available); +DEFINE_PER_CPU(struct kvm_riscv_nacl, kvm_riscv_nacl); + +void __kvm_riscv_nacl_hfence(void *shmem, + unsigned long control, + unsigned long page_num, + unsigned long page_count) +{ + int i, ent = -1, try_count = 5; + unsigned long *entp; + +again: + for (i = 0; i < SBI_NACL_SHMEM_HFENCE_ENTRY_MAX; i++) { + entp = shmem + SBI_NACL_SHMEM_HFENCE_ENTRY_CONFIG(i); + if (lelong_to_cpu(*entp) & SBI_NACL_SHMEM_HFENCE_CONFIG_PEND) + continue; + + ent = i; + break; + } + + if (ent < 0) { + if (try_count) { + nacl_sync_hfence(-1UL); + goto again; + } else { + pr_warn("KVM: No free entry in NACL shared memory\n"); + return; + } + } + + entp = shmem + SBI_NACL_SHMEM_HFENCE_ENTRY_CONFIG(i); + *entp = cpu_to_lelong(control); + entp = shmem + SBI_NACL_SHMEM_HFENCE_ENTRY_PNUM(i); + *entp = cpu_to_lelong(page_num); + entp = shmem + SBI_NACL_SHMEM_HFENCE_ENTRY_PCOUNT(i); + *entp = cpu_to_lelong(page_count); +} + +int kvm_riscv_nacl_enable(void) +{ + int rc; + struct sbiret ret; + struct kvm_riscv_nacl *nacl; + + if (!kvm_riscv_nacl_available()) + return 0; + nacl = this_cpu_ptr(&kvm_riscv_nacl); + + ret = sbi_ecall(SBI_EXT_NACL, SBI_EXT_NACL_SET_SHMEM, + nacl->shmem_phys, 0, 0, 0, 0, 0); + rc = sbi_err_map_linux_errno(ret.error); + if (rc) + return rc; + + return 0; +} + +void kvm_riscv_nacl_disable(void) +{ + if (!kvm_riscv_nacl_available()) + return; + + sbi_ecall(SBI_EXT_NACL, SBI_EXT_NACL_SET_SHMEM, + SBI_SHMEM_DISABLE, SBI_SHMEM_DISABLE, 0, 0, 0, 0); +} + +void kvm_riscv_nacl_exit(void) +{ + int cpu; + struct kvm_riscv_nacl *nacl; + + if (!kvm_riscv_nacl_available()) + return; + + /* Allocate per-CPU shared memory */ + for_each_possible_cpu(cpu) { + nacl = per_cpu_ptr(&kvm_riscv_nacl, cpu); + if (!nacl->shmem) + continue; + + free_pages((unsigned long)nacl->shmem, + get_order(SBI_NACL_SHMEM_SIZE)); + nacl->shmem = NULL; + nacl->shmem_phys = 0; + } +} + +static long nacl_probe_feature(long feature_id) +{ + struct sbiret ret; + + if (!kvm_riscv_nacl_available()) + return 0; + + ret = sbi_ecall(SBI_EXT_NACL, SBI_EXT_NACL_PROBE_FEATURE, + feature_id, 0, 0, 0, 0, 0); + return ret.value; +} + +int kvm_riscv_nacl_init(void) +{ + int cpu; + struct page *shmem_page; + struct kvm_riscv_nacl *nacl; + + if (sbi_spec_version < sbi_mk_version(1, 0) || + sbi_probe_extension(SBI_EXT_NACL) <= 0) + return -ENODEV; + + /* Enable NACL support */ + static_branch_enable(&kvm_riscv_nacl_available); + + /* Probe NACL features */ + if (nacl_probe_feature(SBI_NACL_FEAT_SYNC_CSR)) + static_branch_enable(&kvm_riscv_nacl_sync_csr_available); + if (nacl_probe_feature(SBI_NACL_FEAT_SYNC_HFENCE)) + static_branch_enable(&kvm_riscv_nacl_sync_hfence_available); + if (nacl_probe_feature(SBI_NACL_FEAT_SYNC_SRET)) + static_branch_enable(&kvm_riscv_nacl_sync_sret_available); + if (nacl_probe_feature(SBI_NACL_FEAT_AUTOSWAP_CSR)) + static_branch_enable(&kvm_riscv_nacl_autoswap_csr_available); + + /* Allocate per-CPU shared memory */ + for_each_possible_cpu(cpu) { + nacl = per_cpu_ptr(&kvm_riscv_nacl, cpu); + + shmem_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + get_order(SBI_NACL_SHMEM_SIZE)); + if (!shmem_page) { + kvm_riscv_nacl_exit(); + return -ENOMEM; + } + nacl->shmem = page_to_virt(shmem_page); + nacl->shmem_phys = page_to_phys(shmem_page); + } + + return 0; +} diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 309d79b3e5cd..ff1aeac4eb8e 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -12,11 +12,13 @@ #include <linux/kvm_host.h> #include <asm/cacheflush.h> #include <asm/csr.h> -#include <asm/hwcap.h> +#include <asm/cpufeature.h> #include <asm/insn-def.h> +#include <asm/kvm_nacl.h> +#include <asm/kvm_tlb.h> +#include <asm/kvm_vmid.h> -#define has_svinval() \ - static_branch_unlikely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_SVINVAL]) +#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL) void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, gpa_t gpa, gpa_t gpsz, @@ -177,27 +179,41 @@ void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu) vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); kvm_riscv_local_hfence_gvma_vmid_all(vmid); + + /* + * Flush VS-stage TLB entries for implementation where VS-stage + * TLB does not cahce guest physical address and VMID. + */ + if (static_branch_unlikely(&kvm_riscv_vsstage_tlb_no_gpa)) + kvm_riscv_local_hfence_vvma_all(vmid); } void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu) { + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_RCVD); local_flush_icache_all(); } -void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu) +void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu) { - struct kvm_vmid *vmid; + struct kvm_vmid *v = &vcpu->kvm->arch.vmid; + unsigned long vmid = READ_ONCE(v->vmid); - vmid = &vcpu->kvm->arch.vmid; - kvm_riscv_local_hfence_gvma_vmid_all(READ_ONCE(vmid->vmid)); + if (kvm_riscv_nacl_available()) + nacl_hfence_gvma_vmid_all(nacl_shmem(), vmid); + else + kvm_riscv_local_hfence_gvma_vmid_all(vmid); } void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu) { - struct kvm_vmid *vmid; + struct kvm_vmid *v = &vcpu->kvm->arch.vmid; + unsigned long vmid = READ_ONCE(v->vmid); - vmid = &vcpu->kvm->arch.vmid; - kvm_riscv_local_hfence_vvma_all(READ_ONCE(vmid->vmid)); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma_all(nacl_shmem(), vmid); + else + kvm_riscv_local_hfence_vvma_all(vmid); } static bool vcpu_hfence_dequeue(struct kvm_vcpu *vcpu, @@ -252,30 +268,56 @@ static bool vcpu_hfence_enqueue(struct kvm_vcpu *vcpu, void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu) { struct kvm_riscv_hfence d = { 0 }; - struct kvm_vmid *v = &vcpu->kvm->arch.vmid; while (vcpu_hfence_dequeue(vcpu, &d)) { switch (d.type) { case KVM_RISCV_HFENCE_UNKNOWN: break; case KVM_RISCV_HFENCE_GVMA_VMID_GPA: - kvm_riscv_local_hfence_gvma_vmid_gpa( - READ_ONCE(v->vmid), - d.addr, d.size, d.order); + if (kvm_riscv_nacl_available()) + nacl_hfence_gvma_vmid(nacl_shmem(), d.vmid, + d.addr, d.size, d.order); + else + kvm_riscv_local_hfence_gvma_vmid_gpa(d.vmid, d.addr, + d.size, d.order); + break; + case KVM_RISCV_HFENCE_GVMA_VMID_ALL: + if (kvm_riscv_nacl_available()) + nacl_hfence_gvma_vmid_all(nacl_shmem(), d.vmid); + else + kvm_riscv_local_hfence_gvma_vmid_all(d.vmid); break; case KVM_RISCV_HFENCE_VVMA_ASID_GVA: - kvm_riscv_local_hfence_vvma_asid_gva( - READ_ONCE(v->vmid), d.asid, - d.addr, d.size, d.order); + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma_asid(nacl_shmem(), d.vmid, d.asid, + d.addr, d.size, d.order); + else + kvm_riscv_local_hfence_vvma_asid_gva(d.vmid, d.asid, d.addr, + d.size, d.order); break; case KVM_RISCV_HFENCE_VVMA_ASID_ALL: - kvm_riscv_local_hfence_vvma_asid_all( - READ_ONCE(v->vmid), d.asid); + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma_asid_all(nacl_shmem(), d.vmid, d.asid); + else + kvm_riscv_local_hfence_vvma_asid_all(d.vmid, d.asid); break; case KVM_RISCV_HFENCE_VVMA_GVA: - kvm_riscv_local_hfence_vvma_gva( - READ_ONCE(v->vmid), - d.addr, d.size, d.order); + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma(nacl_shmem(), d.vmid, + d.addr, d.size, d.order); + else + kvm_riscv_local_hfence_vvma_gva(d.vmid, d.addr, + d.size, d.order); + break; + case KVM_RISCV_HFENCE_VVMA_ALL: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma_all(nacl_shmem(), d.vmid); + else + kvm_riscv_local_hfence_vvma_all(d.vmid); break; default: break; @@ -293,7 +335,7 @@ static void make_xfence_request(struct kvm *kvm, unsigned int actual_req = req; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); - bitmap_clear(vcpu_mask, 0, KVM_MAX_VCPUS); + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { if (hbase != -1UL) { if (vcpu->vcpu_id < hbase) @@ -329,35 +371,43 @@ void kvm_riscv_fence_i(struct kvm *kvm, void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, unsigned long hbase, unsigned long hmask, gpa_t gpa, gpa_t gpsz, - unsigned long order) + unsigned long order, unsigned long vmid) { struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_GVMA_VMID_GPA; data.asid = 0; + data.vmid = vmid; data.addr = gpa; data.size = gpsz; data.order = order; make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, - KVM_REQ_HFENCE_GVMA_VMID_ALL, &data); + KVM_REQ_TLB_FLUSH, &data); } void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, - unsigned long hbase, unsigned long hmask) + unsigned long hbase, unsigned long hmask, + unsigned long vmid) { - make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_GVMA_VMID_ALL, - KVM_REQ_HFENCE_GVMA_VMID_ALL, NULL); + struct kvm_riscv_hfence data = {0}; + + data.type = KVM_RISCV_HFENCE_GVMA_VMID_ALL; + data.vmid = vmid; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_TLB_FLUSH, &data); } void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, unsigned long hbase, unsigned long hmask, unsigned long gva, unsigned long gvsz, - unsigned long order, unsigned long asid) + unsigned long order, unsigned long asid, + unsigned long vmid) { struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_VVMA_ASID_GVA; data.asid = asid; + data.vmid = vmid; data.addr = gva; data.size = gvsz; data.order = order; @@ -367,13 +417,13 @@ void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, unsigned long hbase, unsigned long hmask, - unsigned long asid) + unsigned long asid, unsigned long vmid) { - struct kvm_riscv_hfence data; + struct kvm_riscv_hfence data = {0}; data.type = KVM_RISCV_HFENCE_VVMA_ASID_ALL; data.asid = asid; - data.addr = data.size = data.order = 0; + data.vmid = vmid; make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, KVM_REQ_HFENCE_VVMA_ALL, &data); } @@ -381,12 +431,13 @@ void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, unsigned long hbase, unsigned long hmask, unsigned long gva, unsigned long gvsz, - unsigned long order) + unsigned long order, unsigned long vmid) { struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_VVMA_GVA; data.asid = 0; + data.vmid = vmid; data.addr = gva; data.size = gvsz; data.order = order; @@ -395,8 +446,21 @@ void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, } void kvm_riscv_hfence_vvma_all(struct kvm *kvm, - unsigned long hbase, unsigned long hmask) + unsigned long hbase, unsigned long hmask, + unsigned long vmid) +{ + struct kvm_riscv_hfence data = {0}; + + data.type = KVM_RISCV_HFENCE_VVMA_ALL; + data.vmid = vmid; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_VVMA_ALL, &data); +} + +int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { - make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_VVMA_ALL, - KVM_REQ_HFENCE_VVMA_ALL, NULL); + kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, + gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT, + PAGE_SHIFT, READ_ONCE(kvm->arch.vmid.vmid)); + return 0; } diff --git a/arch/riscv/kvm/trace.h b/arch/riscv/kvm/trace.h new file mode 100644 index 000000000000..3d54175d805c --- /dev/null +++ b/arch/riscv/kvm/trace.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tracepoints for RISC-V KVM + * + * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd. + * + */ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +TRACE_EVENT(kvm_entry, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.guest_context.sepc; + ), + + TP_printk("PC: 0x016%lx", __entry->pc) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(struct kvm_cpu_trap *trap), + TP_ARGS(trap), + + TP_STRUCT__entry( + __field(unsigned long, sepc) + __field(unsigned long, scause) + __field(unsigned long, stval) + __field(unsigned long, htval) + __field(unsigned long, htinst) + ), + + TP_fast_assign( + __entry->sepc = trap->sepc; + __entry->scause = trap->scause; + __entry->stval = trap->stval; + __entry->htval = trap->htval; + __entry->htinst = trap->htinst; + ), + + TP_printk("SEPC:0x%lx, SCAUSE:0x%lx, STVAL:0x%lx, HTVAL:0x%lx, HTINST:0x%lx", + __entry->sepc, + __entry->scause, + __entry->stval, + __entry->htval, + __entry->htinst) +); + +#endif /* _TRACE_RSICV_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 7c08567097f0..a55a95da54d0 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -7,32 +7,39 @@ */ #include <linux/bitops.h> -#include <linux/entry-kvm.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/kdebug.h> #include <linux/module.h> #include <linux/percpu.h> -#include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/kvm_host.h> -#include <asm/csr.h> #include <asm/cacheflush.h> -#include <asm/hwcap.h> -#include <asm/sbi.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nacl.h> +#include <asm/kvm_vcpu_vector.h> + +#define CREATE_TRACE_POINTS +#include "trace.h" const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), STATS_DESC_COUNTER(VCPU, ecall_exit_stat), STATS_DESC_COUNTER(VCPU, wfi_exit_stat), + STATS_DESC_COUNTER(VCPU, wrs_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), STATS_DESC_COUNTER(VCPU, csr_exit_user), STATS_DESC_COUNTER(VCPU, csr_exit_kernel), STATS_DESC_COUNTER(VCPU, signal_exits), - STATS_DESC_COUNTER(VCPU, exits) + STATS_DESC_COUNTER(VCPU, exits), + STATS_DESC_COUNTER(VCPU, instr_illegal_exits), + STATS_DESC_COUNTER(VCPU, load_misaligned_exits), + STATS_DESC_COUNTER(VCPU, store_misaligned_exits), + STATS_DESC_COUNTER(VCPU, load_access_exits), + STATS_DESC_COUNTER(VCPU, store_access_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -44,75 +51,33 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -#define KVM_RISCV_BASE_ISA_MASK GENMASK(25, 0) - -#define KVM_ISA_EXT_ARR(ext) [KVM_RISCV_ISA_EXT_##ext] = RISCV_ISA_EXT_##ext - -/* Mapping between KVM ISA Extension ID & Host ISA extension ID */ -static const unsigned long kvm_isa_ext_arr[] = { - [KVM_RISCV_ISA_EXT_A] = RISCV_ISA_EXT_a, - [KVM_RISCV_ISA_EXT_C] = RISCV_ISA_EXT_c, - [KVM_RISCV_ISA_EXT_D] = RISCV_ISA_EXT_d, - [KVM_RISCV_ISA_EXT_F] = RISCV_ISA_EXT_f, - [KVM_RISCV_ISA_EXT_H] = RISCV_ISA_EXT_h, - [KVM_RISCV_ISA_EXT_I] = RISCV_ISA_EXT_i, - [KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m, - - KVM_ISA_EXT_ARR(SSTC), - KVM_ISA_EXT_ARR(SVINVAL), - KVM_ISA_EXT_ARR(SVPBMT), - KVM_ISA_EXT_ARR(ZIHINTPAUSE), - KVM_ISA_EXT_ARR(ZICBOM), -}; - -static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) +static void kvm_riscv_vcpu_context_reset(struct kvm_vcpu *vcpu, + bool kvm_sbi_reset) { - unsigned long i; - - for (i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) { - if (kvm_isa_ext_arr[i] == base_ext) - return i; - } + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + void *vector_datap = cntx->vector.datap; - return KVM_RISCV_ISA_EXT_MAX; -} + memset(cntx, 0, sizeof(*cntx)); + memset(csr, 0, sizeof(*csr)); + memset(&vcpu->arch.smstateen_csr, 0, sizeof(vcpu->arch.smstateen_csr)); -static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext) -{ - switch (ext) { - case KVM_RISCV_ISA_EXT_H: - return false; - default: - break; - } + /* Restore datap as it's not a part of the guest context. */ + cntx->vector.datap = vector_datap; - return true; -} + if (kvm_sbi_reset) + kvm_riscv_vcpu_sbi_load_reset_state(vcpu); -static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) -{ - switch (ext) { - case KVM_RISCV_ISA_EXT_A: - case KVM_RISCV_ISA_EXT_C: - case KVM_RISCV_ISA_EXT_I: - case KVM_RISCV_ISA_EXT_M: - case KVM_RISCV_ISA_EXT_SSTC: - case KVM_RISCV_ISA_EXT_SVINVAL: - case KVM_RISCV_ISA_EXT_ZIHINTPAUSE: - return false; - default: - break; - } + /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */ + cntx->sstatus = SR_SPP | SR_SPIE; - return true; + cntx->hstatus |= HSTATUS_VTW; + cntx->hstatus |= HSTATUS_SPVP; + cntx->hstatus |= HSTATUS_SPV; } -static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) +static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu, bool kvm_sbi_reset) { - struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; - struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; - struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; - struct kvm_cpu_context *reset_cntx = &vcpu->arch.guest_reset_context; bool loaded; /** @@ -127,21 +92,27 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.last_exit_cpu = -1; - memcpy(csr, reset_csr, sizeof(*csr)); - - memcpy(cntx, reset_cntx, sizeof(*cntx)); + kvm_riscv_vcpu_context_reset(vcpu, kvm_sbi_reset); kvm_riscv_vcpu_fp_reset(vcpu); + kvm_riscv_vcpu_vector_reset(vcpu); + kvm_riscv_vcpu_timer_reset(vcpu); - WRITE_ONCE(vcpu->arch.irqs_pending, 0); - WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0); + kvm_riscv_vcpu_aia_reset(vcpu); + + bitmap_zero(vcpu->arch.irqs_pending, KVM_RISCV_VCPU_NR_IRQS); + bitmap_zero(vcpu->arch.irqs_pending_mask, KVM_RISCV_VCPU_NR_IRQS); + + kvm_riscv_vcpu_pmu_reset(vcpu); vcpu->arch.hfence_head = 0; vcpu->arch.hfence_tail = 0; memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue)); + kvm_riscv_vcpu_sbi_reset(vcpu); + /* Reset the guest CSRs for hotplug usecase */ if (loaded) kvm_arch_vcpu_load(vcpu, smp_processor_id()); @@ -155,22 +126,19 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - struct kvm_cpu_context *cntx; - struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; - unsigned long host_isa, i; + int rc; + + spin_lock_init(&vcpu->arch.mp_state_lock); /* Mark this VCPU never ran */ vcpu->arch.ran_atleast_once = false; + + vcpu->arch.cfg.hedeleg = KVM_HEDELEG_DEFAULT; vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; bitmap_zero(vcpu->arch.isa, RISCV_ISA_EXT_MAX); /* Setup ISA features available to VCPU */ - for (i = 0; i < ARRAY_SIZE(kvm_isa_ext_arr); i++) { - host_isa = kvm_isa_ext_arr[i]; - if (__riscv_isa_extension_available(NULL, host_isa) && - kvm_riscv_vcpu_isa_enable_allowed(i)) - set_bit(host_isa, vcpu->arch.isa); - } + kvm_riscv_vcpu_setup_isa(vcpu); /* Setup vendor, arch, and implementation details */ vcpu->arch.mvendorid = sbi_get_mvendorid(); @@ -180,22 +148,29 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) /* Setup VCPU hfence queue */ spin_lock_init(&vcpu->arch.hfence_lock); - /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */ - cntx = &vcpu->arch.guest_reset_context; - cntx->sstatus = SR_SPP | SR_SPIE; - cntx->hstatus = 0; - cntx->hstatus |= HSTATUS_VTW; - cntx->hstatus |= HSTATUS_SPVP; - cntx->hstatus |= HSTATUS_SPV; + spin_lock_init(&vcpu->arch.reset_state.lock); - /* By default, make CY, TM, and IR counters accessible in VU mode */ - reset_csr->scounteren = 0x7; + rc = kvm_riscv_vcpu_alloc_vector_context(vcpu); + if (rc) + return rc; /* Setup VCPU timer */ kvm_riscv_vcpu_timer_init(vcpu); + /* setup performance monitoring */ + kvm_riscv_vcpu_pmu_init(vcpu); + + /* Setup VCPU AIA */ + kvm_riscv_vcpu_aia_init(vcpu); + + /* + * Setup SBI extensions + * NOTE: This must be the last thing to be initialized. + */ + kvm_riscv_vcpu_sbi_init(vcpu); + /* Reset VCPU */ - kvm_riscv_reset_vcpu(vcpu); + kvm_riscv_reset_vcpu(vcpu, false); return 0; } @@ -213,11 +188,21 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + kvm_riscv_vcpu_sbi_deinit(vcpu); + + /* Cleanup VCPU AIA context */ + kvm_riscv_vcpu_aia_deinit(vcpu); + /* Cleanup VCPU timer */ kvm_riscv_vcpu_timer_deinit(vcpu); + kvm_riscv_vcpu_pmu_deinit(vcpu); + /* Free unused pages pre-allocated for G-stage page table mappings */ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + + /* Free vector context space for host and guest kernel */ + kvm_riscv_vcpu_free_vector_context(vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) @@ -225,18 +210,10 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvm_riscv_vcpu_timer_pending(vcpu); } -void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) -{ -} - -void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) -{ -} - int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return (kvm_riscv_vcpu_has_interrupts(vcpu, -1UL) && - !vcpu->arch.power_off && !vcpu->arch.pause); + return (kvm_riscv_vcpu_has_interrupts(vcpu, -1ULL) && + !kvm_riscv_vcpu_stopped(vcpu) && !vcpu->arch.pause); } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) @@ -249,387 +226,20 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return (vcpu->arch.guest_context.sstatus & SR_SPP) ? true : false; } -vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) -{ - return VM_FAULT_SIGBUS; -} - -static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) -{ - unsigned long __user *uaddr = - (unsigned long __user *)(unsigned long)reg->addr; - unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | - KVM_REG_SIZE_MASK | - KVM_REG_RISCV_CONFIG); - unsigned long reg_val; - - if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) - return -EINVAL; - - switch (reg_num) { - case KVM_REG_RISCV_CONFIG_REG(isa): - reg_val = vcpu->arch.isa[0] & KVM_RISCV_BASE_ISA_MASK; - break; - case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): - if (!riscv_isa_extension_available(vcpu->arch.isa, ZICBOM)) - return -EINVAL; - reg_val = riscv_cbom_block_size; - break; - case KVM_REG_RISCV_CONFIG_REG(mvendorid): - reg_val = vcpu->arch.mvendorid; - break; - case KVM_REG_RISCV_CONFIG_REG(marchid): - reg_val = vcpu->arch.marchid; - break; - case KVM_REG_RISCV_CONFIG_REG(mimpid): - reg_val = vcpu->arch.mimpid; - break; - default: - return -EINVAL; - } - - if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - return 0; -} - -static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) -{ - unsigned long __user *uaddr = - (unsigned long __user *)(unsigned long)reg->addr; - unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | - KVM_REG_SIZE_MASK | - KVM_REG_RISCV_CONFIG); - unsigned long i, isa_ext, reg_val; - - if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) - return -EINVAL; - - if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - switch (reg_num) { - case KVM_REG_RISCV_CONFIG_REG(isa): - /* - * This ONE REG interface is only defined for - * single letter extensions. - */ - if (fls(reg_val) >= RISCV_ISA_EXT_BASE) - return -EINVAL; - - if (!vcpu->arch.ran_atleast_once) { - /* Ignore the enable/disable request for certain extensions */ - for (i = 0; i < RISCV_ISA_EXT_BASE; i++) { - isa_ext = kvm_riscv_vcpu_base2isa_ext(i); - if (isa_ext >= KVM_RISCV_ISA_EXT_MAX) { - reg_val &= ~BIT(i); - continue; - } - if (!kvm_riscv_vcpu_isa_enable_allowed(isa_ext)) - if (reg_val & BIT(i)) - reg_val &= ~BIT(i); - if (!kvm_riscv_vcpu_isa_disable_allowed(isa_ext)) - if (!(reg_val & BIT(i))) - reg_val |= BIT(i); - } - reg_val &= riscv_isa_extension_base(NULL); - /* Do not modify anything beyond single letter extensions */ - reg_val = (vcpu->arch.isa[0] & ~KVM_RISCV_BASE_ISA_MASK) | - (reg_val & KVM_RISCV_BASE_ISA_MASK); - vcpu->arch.isa[0] = reg_val; - kvm_riscv_vcpu_fp_reset(vcpu); - } else { - return -EOPNOTSUPP; - } - break; - case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): - return -EOPNOTSUPP; - case KVM_REG_RISCV_CONFIG_REG(mvendorid): - if (!vcpu->arch.ran_atleast_once) - vcpu->arch.mvendorid = reg_val; - else - return -EBUSY; - break; - case KVM_REG_RISCV_CONFIG_REG(marchid): - if (!vcpu->arch.ran_atleast_once) - vcpu->arch.marchid = reg_val; - else - return -EBUSY; - break; - case KVM_REG_RISCV_CONFIG_REG(mimpid): - if (!vcpu->arch.ran_atleast_once) - vcpu->arch.mimpid = reg_val; - else - return -EBUSY; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int kvm_riscv_vcpu_get_reg_core(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) -{ - struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; - unsigned long __user *uaddr = - (unsigned long __user *)(unsigned long)reg->addr; - unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | - KVM_REG_SIZE_MASK | - KVM_REG_RISCV_CORE); - unsigned long reg_val; - - if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) - return -EINVAL; - if (reg_num >= sizeof(struct kvm_riscv_core) / sizeof(unsigned long)) - return -EINVAL; - - if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc)) - reg_val = cntx->sepc; - else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num && - reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6)) - reg_val = ((unsigned long *)cntx)[reg_num]; - else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) - reg_val = (cntx->sstatus & SR_SPP) ? - KVM_RISCV_MODE_S : KVM_RISCV_MODE_U; - else - return -EINVAL; - - if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - return 0; -} - -static int kvm_riscv_vcpu_set_reg_core(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) +#ifdef CONFIG_GUEST_PERF_EVENTS +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) { - struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; - unsigned long __user *uaddr = - (unsigned long __user *)(unsigned long)reg->addr; - unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | - KVM_REG_SIZE_MASK | - KVM_REG_RISCV_CORE); - unsigned long reg_val; - - if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) - return -EINVAL; - if (reg_num >= sizeof(struct kvm_riscv_core) / sizeof(unsigned long)) - return -EINVAL; - - if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc)) - cntx->sepc = reg_val; - else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num && - reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6)) - ((unsigned long *)cntx)[reg_num] = reg_val; - else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) { - if (reg_val == KVM_RISCV_MODE_S) - cntx->sstatus |= SR_SPP; - else - cntx->sstatus &= ~SR_SPP; - } else - return -EINVAL; - - return 0; -} - -static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) -{ - struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; - unsigned long __user *uaddr = - (unsigned long __user *)(unsigned long)reg->addr; - unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | - KVM_REG_SIZE_MASK | - KVM_REG_RISCV_CSR); - unsigned long reg_val; - - if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) - return -EINVAL; - if (reg_num >= sizeof(struct kvm_riscv_csr) / sizeof(unsigned long)) - return -EINVAL; - - if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) { - kvm_riscv_vcpu_flush_interrupts(vcpu); - reg_val = (csr->hvip >> VSIP_TO_HVIP_SHIFT) & VSIP_VALID_MASK; - } else - reg_val = ((unsigned long *)csr)[reg_num]; - - if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - return 0; -} - -static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) -{ - struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; - unsigned long __user *uaddr = - (unsigned long __user *)(unsigned long)reg->addr; - unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | - KVM_REG_SIZE_MASK | - KVM_REG_RISCV_CSR); - unsigned long reg_val; - - if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) - return -EINVAL; - if (reg_num >= sizeof(struct kvm_riscv_csr) / sizeof(unsigned long)) - return -EINVAL; - - if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) { - reg_val &= VSIP_VALID_MASK; - reg_val <<= VSIP_TO_HVIP_SHIFT; - } - - ((unsigned long *)csr)[reg_num] = reg_val; - - if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) - WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0); - - return 0; -} - -static int kvm_riscv_vcpu_get_reg_isa_ext(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) -{ - unsigned long __user *uaddr = - (unsigned long __user *)(unsigned long)reg->addr; - unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | - KVM_REG_SIZE_MASK | - KVM_REG_RISCV_ISA_EXT); - unsigned long reg_val = 0; - unsigned long host_isa_ext; - - if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) - return -EINVAL; - - if (reg_num >= KVM_RISCV_ISA_EXT_MAX || - reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) - return -EINVAL; - - host_isa_ext = kvm_isa_ext_arr[reg_num]; - if (__riscv_isa_extension_available(vcpu->arch.isa, host_isa_ext)) - reg_val = 1; /* Mark the given extension as available */ - - if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - return 0; -} - -static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) -{ - unsigned long __user *uaddr = - (unsigned long __user *)(unsigned long)reg->addr; - unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | - KVM_REG_SIZE_MASK | - KVM_REG_RISCV_ISA_EXT); - unsigned long reg_val; - unsigned long host_isa_ext; - - if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) - return -EINVAL; - - if (reg_num >= KVM_RISCV_ISA_EXT_MAX || - reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) - return -EINVAL; - - if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - host_isa_ext = kvm_isa_ext_arr[reg_num]; - if (!__riscv_isa_extension_available(NULL, host_isa_ext)) - return -EOPNOTSUPP; - - if (!vcpu->arch.ran_atleast_once) { - /* - * All multi-letter extension and a few single letter - * extension can be disabled - */ - if (reg_val == 1 && - kvm_riscv_vcpu_isa_enable_allowed(reg_num)) - set_bit(host_isa_ext, vcpu->arch.isa); - else if (!reg_val && - kvm_riscv_vcpu_isa_disable_allowed(reg_num)) - clear_bit(host_isa_ext, vcpu->arch.isa); - else - return -EINVAL; - kvm_riscv_vcpu_fp_reset(vcpu); - } else { - return -EOPNOTSUPP; - } - - return 0; -} - -static int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) -{ - switch (reg->id & KVM_REG_RISCV_TYPE_MASK) { - case KVM_REG_RISCV_CONFIG: - return kvm_riscv_vcpu_set_reg_config(vcpu, reg); - case KVM_REG_RISCV_CORE: - return kvm_riscv_vcpu_set_reg_core(vcpu, reg); - case KVM_REG_RISCV_CSR: - return kvm_riscv_vcpu_set_reg_csr(vcpu, reg); - case KVM_REG_RISCV_TIMER: - return kvm_riscv_vcpu_set_reg_timer(vcpu, reg); - case KVM_REG_RISCV_FP_F: - return kvm_riscv_vcpu_set_reg_fp(vcpu, reg, - KVM_REG_RISCV_FP_F); - case KVM_REG_RISCV_FP_D: - return kvm_riscv_vcpu_set_reg_fp(vcpu, reg, - KVM_REG_RISCV_FP_D); - case KVM_REG_RISCV_ISA_EXT: - return kvm_riscv_vcpu_set_reg_isa_ext(vcpu, reg); - default: - break; - } - - return -EINVAL; + return vcpu->arch.guest_context.sepc; } +#endif -static int kvm_riscv_vcpu_get_reg(struct kvm_vcpu *vcpu, - const struct kvm_one_reg *reg) +vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) { - switch (reg->id & KVM_REG_RISCV_TYPE_MASK) { - case KVM_REG_RISCV_CONFIG: - return kvm_riscv_vcpu_get_reg_config(vcpu, reg); - case KVM_REG_RISCV_CORE: - return kvm_riscv_vcpu_get_reg_core(vcpu, reg); - case KVM_REG_RISCV_CSR: - return kvm_riscv_vcpu_get_reg_csr(vcpu, reg); - case KVM_REG_RISCV_TIMER: - return kvm_riscv_vcpu_get_reg_timer(vcpu, reg); - case KVM_REG_RISCV_FP_F: - return kvm_riscv_vcpu_get_reg_fp(vcpu, reg, - KVM_REG_RISCV_FP_F); - case KVM_REG_RISCV_FP_D: - return kvm_riscv_vcpu_get_reg_fp(vcpu, reg, - KVM_REG_RISCV_FP_D); - case KVM_REG_RISCV_ISA_EXT: - return kvm_riscv_vcpu_get_reg_isa_ext(vcpu, reg); - default: - break; - } - - return -EINVAL; + return VM_FAULT_SIGBUS; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; @@ -671,6 +281,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_riscv_vcpu_get_reg(vcpu, ®); break; } + case KVM_GET_REG_LIST: { + struct kvm_reg_list __user *user_list = argp; + struct kvm_reg_list reg_list; + unsigned int n; + + r = -EFAULT; + if (copy_from_user(®_list, user_list, sizeof(reg_list))) + break; + n = reg_list.n; + reg_list.n = kvm_riscv_vcpu_num_regs(vcpu); + if (copy_to_user(user_list, ®_list, sizeof(reg_list))) + break; + r = -E2BIG; + if (n < reg_list.n) + break; + r = kvm_riscv_vcpu_copy_reg_indices(vcpu, user_list->reg); + break; + } default: break; } @@ -721,13 +349,16 @@ void kvm_riscv_vcpu_flush_interrupts(struct kvm_vcpu *vcpu) struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; unsigned long mask, val; - if (READ_ONCE(vcpu->arch.irqs_pending_mask)) { - mask = xchg_acquire(&vcpu->arch.irqs_pending_mask, 0); - val = READ_ONCE(vcpu->arch.irqs_pending) & mask; + if (READ_ONCE(vcpu->arch.irqs_pending_mask[0])) { + mask = xchg_acquire(&vcpu->arch.irqs_pending_mask[0], 0); + val = READ_ONCE(vcpu->arch.irqs_pending[0]) & mask; csr->hvip &= ~mask; csr->hvip |= val; } + + /* Flush AIA high interrupts */ + kvm_riscv_vcpu_aia_flush_interrupts(vcpu); } void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu) @@ -737,36 +368,53 @@ void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu) struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; /* Read current HVIP and VSIE CSRs */ - csr->vsie = csr_read(CSR_VSIE); + csr->vsie = ncsr_read(CSR_VSIE); /* Sync-up HVIP.VSSIP bit changes does by Guest */ - hvip = csr_read(CSR_HVIP); + hvip = ncsr_read(CSR_HVIP); if ((csr->hvip ^ hvip) & (1UL << IRQ_VS_SOFT)) { if (hvip & (1UL << IRQ_VS_SOFT)) { if (!test_and_set_bit(IRQ_VS_SOFT, - &v->irqs_pending_mask)) - set_bit(IRQ_VS_SOFT, &v->irqs_pending); + v->irqs_pending_mask)) + set_bit(IRQ_VS_SOFT, v->irqs_pending); } else { if (!test_and_set_bit(IRQ_VS_SOFT, - &v->irqs_pending_mask)) - clear_bit(IRQ_VS_SOFT, &v->irqs_pending); + v->irqs_pending_mask)) + clear_bit(IRQ_VS_SOFT, v->irqs_pending); } } + /* Sync up the HVIP.LCOFIP bit changes (only clear) by the guest */ + if ((csr->hvip ^ hvip) & (1UL << IRQ_PMU_OVF)) { + if (!(hvip & (1UL << IRQ_PMU_OVF)) && + !test_and_set_bit(IRQ_PMU_OVF, v->irqs_pending_mask)) + clear_bit(IRQ_PMU_OVF, v->irqs_pending); + } + + /* Sync-up AIA high interrupts */ + kvm_riscv_vcpu_aia_sync_interrupts(vcpu); + /* Sync-up timer CSRs */ kvm_riscv_vcpu_timer_sync(vcpu); } int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) { - if (irq != IRQ_VS_SOFT && + /* + * We only allow VS-mode software, timer, and external + * interrupts when irq is one of the local interrupts + * defined by RISC-V privilege specification. + */ + if (irq < IRQ_LOCAL_MAX && + irq != IRQ_VS_SOFT && irq != IRQ_VS_TIMER && - irq != IRQ_VS_EXT) + irq != IRQ_VS_EXT && + irq != IRQ_PMU_OVF) return -EINVAL; - set_bit(irq, &vcpu->arch.irqs_pending); + set_bit(irq, vcpu->arch.irqs_pending); smp_mb__before_atomic(); - set_bit(irq, &vcpu->arch.irqs_pending_mask); + set_bit(irq, vcpu->arch.irqs_pending_mask); kvm_vcpu_kick(vcpu); @@ -775,46 +423,76 @@ int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) int kvm_riscv_vcpu_unset_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) { - if (irq != IRQ_VS_SOFT && + /* + * We only allow VS-mode software, timer, counter overflow and external + * interrupts when irq is one of the local interrupts + * defined by RISC-V privilege specification. + */ + if (irq < IRQ_LOCAL_MAX && + irq != IRQ_VS_SOFT && irq != IRQ_VS_TIMER && - irq != IRQ_VS_EXT) + irq != IRQ_VS_EXT && + irq != IRQ_PMU_OVF) return -EINVAL; - clear_bit(irq, &vcpu->arch.irqs_pending); + clear_bit(irq, vcpu->arch.irqs_pending); smp_mb__before_atomic(); - set_bit(irq, &vcpu->arch.irqs_pending_mask); + set_bit(irq, vcpu->arch.irqs_pending_mask); return 0; } -bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, unsigned long mask) +bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, u64 mask) { - unsigned long ie = ((vcpu->arch.guest_csr.vsie & VSIP_VALID_MASK) - << VSIP_TO_HVIP_SHIFT) & mask; + unsigned long ie; - return (READ_ONCE(vcpu->arch.irqs_pending) & ie) ? true : false; + ie = ((vcpu->arch.guest_csr.vsie & VSIP_VALID_MASK) + << VSIP_TO_HVIP_SHIFT) & (unsigned long)mask; + ie |= vcpu->arch.guest_csr.vsie & ~IRQ_LOCAL_MASK & + (unsigned long)mask; + if (READ_ONCE(vcpu->arch.irqs_pending[0]) & ie) + return true; + + /* Check AIA high interrupts */ + return kvm_riscv_vcpu_aia_has_interrupts(vcpu, mask); } -void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu) +void __kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu) { - vcpu->arch.power_off = true; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); kvm_make_request(KVM_REQ_SLEEP, vcpu); kvm_vcpu_kick(vcpu); } -void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu) +void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu) { - vcpu->arch.power_off = false; + spin_lock(&vcpu->arch.mp_state_lock); + __kvm_riscv_vcpu_power_off(vcpu); + spin_unlock(&vcpu->arch.mp_state_lock); +} + +void __kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu) +{ + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); kvm_vcpu_wake_up(vcpu); } +void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->arch.mp_state_lock); + __kvm_riscv_vcpu_power_on(vcpu); + spin_unlock(&vcpu->arch.mp_state_lock); +} + +bool kvm_riscv_vcpu_stopped(struct kvm_vcpu *vcpu) +{ + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED; +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (vcpu->arch.power_off) - mp_state->mp_state = KVM_MP_STATE_STOPPED; - else - mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + *mp_state = READ_ONCE(vcpu->arch.mp_state); return 0; } @@ -824,97 +502,194 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { int ret = 0; + spin_lock(&vcpu->arch.mp_state_lock); + switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: - vcpu->arch.power_off = false; + WRITE_ONCE(vcpu->arch.mp_state, *mp_state); break; case KVM_MP_STATE_STOPPED: - kvm_riscv_vcpu_power_off(vcpu); + __kvm_riscv_vcpu_power_off(vcpu); + break; + case KVM_MP_STATE_INIT_RECEIVED: + if (vcpu->kvm->arch.mp_state_reset) + kvm_riscv_reset_vcpu(vcpu, false); + else + ret = -EINVAL; break; default: ret = -EINVAL; } + spin_unlock(&vcpu->arch.mp_state_lock); + return ret; } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - /* TODO; To be implemented later. */ - return -EINVAL; + if (dbg->control & KVM_GUESTDBG_ENABLE) { + vcpu->guest_debug = dbg->control; + vcpu->arch.cfg.hedeleg &= ~BIT(EXC_BREAKPOINT); + } else { + vcpu->guest_debug = 0; + vcpu->arch.cfg.hedeleg |= BIT(EXC_BREAKPOINT); + } + + return 0; } -static void kvm_riscv_vcpu_update_config(const unsigned long *isa) +static void kvm_riscv_vcpu_setup_config(struct kvm_vcpu *vcpu) { - u64 henvcfg = 0; + const unsigned long *isa = vcpu->arch.isa; + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; if (riscv_isa_extension_available(isa, SVPBMT)) - henvcfg |= ENVCFG_PBMTE; + cfg->henvcfg |= ENVCFG_PBMTE; if (riscv_isa_extension_available(isa, SSTC)) - henvcfg |= ENVCFG_STCE; + cfg->henvcfg |= ENVCFG_STCE; if (riscv_isa_extension_available(isa, ZICBOM)) - henvcfg |= (ENVCFG_CBIE | ENVCFG_CBCFE); + cfg->henvcfg |= (ENVCFG_CBIE | ENVCFG_CBCFE); + + if (riscv_isa_extension_available(isa, ZICBOZ)) + cfg->henvcfg |= ENVCFG_CBZE; + + if (riscv_isa_extension_available(isa, SVADU) && + !riscv_isa_extension_available(isa, SVADE)) + cfg->henvcfg |= ENVCFG_ADUE; + + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + cfg->hstateen0 |= SMSTATEEN0_HSENVCFG; + if (riscv_isa_extension_available(isa, SSAIA)) + cfg->hstateen0 |= SMSTATEEN0_AIA_IMSIC | + SMSTATEEN0_AIA | + SMSTATEEN0_AIA_ISEL; + if (riscv_isa_extension_available(isa, SMSTATEEN)) + cfg->hstateen0 |= SMSTATEEN0_SSTATEEN0; + } - csr_write(CSR_HENVCFG, henvcfg); -#ifdef CONFIG_32BIT - csr_write(CSR_HENVCFGH, henvcfg >> 32); -#endif + if (vcpu->guest_debug) + cfg->hedeleg &= ~BIT(EXC_BREAKPOINT); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + void *nsh; struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + if (kvm_riscv_nacl_sync_csr_available()) { + nsh = nacl_shmem(); + nacl_csr_write(nsh, CSR_VSSTATUS, csr->vsstatus); + nacl_csr_write(nsh, CSR_VSIE, csr->vsie); + nacl_csr_write(nsh, CSR_VSTVEC, csr->vstvec); + nacl_csr_write(nsh, CSR_VSSCRATCH, csr->vsscratch); + nacl_csr_write(nsh, CSR_VSEPC, csr->vsepc); + nacl_csr_write(nsh, CSR_VSCAUSE, csr->vscause); + nacl_csr_write(nsh, CSR_VSTVAL, csr->vstval); + nacl_csr_write(nsh, CSR_HEDELEG, cfg->hedeleg); + nacl_csr_write(nsh, CSR_HVIP, csr->hvip); + nacl_csr_write(nsh, CSR_VSATP, csr->vsatp); + nacl_csr_write(nsh, CSR_HENVCFG, cfg->henvcfg); + if (IS_ENABLED(CONFIG_32BIT)) + nacl_csr_write(nsh, CSR_HENVCFGH, cfg->henvcfg >> 32); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + nacl_csr_write(nsh, CSR_HSTATEEN0, cfg->hstateen0); + if (IS_ENABLED(CONFIG_32BIT)) + nacl_csr_write(nsh, CSR_HSTATEEN0H, cfg->hstateen0 >> 32); + } + } else { + csr_write(CSR_VSSTATUS, csr->vsstatus); + csr_write(CSR_VSIE, csr->vsie); + csr_write(CSR_VSTVEC, csr->vstvec); + csr_write(CSR_VSSCRATCH, csr->vsscratch); + csr_write(CSR_VSEPC, csr->vsepc); + csr_write(CSR_VSCAUSE, csr->vscause); + csr_write(CSR_VSTVAL, csr->vstval); + csr_write(CSR_HEDELEG, cfg->hedeleg); + csr_write(CSR_HVIP, csr->hvip); + csr_write(CSR_VSATP, csr->vsatp); + csr_write(CSR_HENVCFG, cfg->henvcfg); + if (IS_ENABLED(CONFIG_32BIT)) + csr_write(CSR_HENVCFGH, cfg->henvcfg >> 32); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + csr_write(CSR_HSTATEEN0, cfg->hstateen0); + if (IS_ENABLED(CONFIG_32BIT)) + csr_write(CSR_HSTATEEN0H, cfg->hstateen0 >> 32); + } + } - csr_write(CSR_VSSTATUS, csr->vsstatus); - csr_write(CSR_VSIE, csr->vsie); - csr_write(CSR_VSTVEC, csr->vstvec); - csr_write(CSR_VSSCRATCH, csr->vsscratch); - csr_write(CSR_VSEPC, csr->vsepc); - csr_write(CSR_VSCAUSE, csr->vscause); - csr_write(CSR_VSTVAL, csr->vstval); - csr_write(CSR_HVIP, csr->hvip); - csr_write(CSR_VSATP, csr->vsatp); - - kvm_riscv_vcpu_update_config(vcpu->arch.isa); - - kvm_riscv_gstage_update_hgatp(vcpu); + kvm_riscv_mmu_update_hgatp(vcpu); kvm_riscv_vcpu_timer_restore(vcpu); kvm_riscv_vcpu_host_fp_save(&vcpu->arch.host_context); kvm_riscv_vcpu_guest_fp_restore(&vcpu->arch.guest_context, vcpu->arch.isa); + kvm_riscv_vcpu_host_vector_save(&vcpu->arch.host_context); + kvm_riscv_vcpu_guest_vector_restore(&vcpu->arch.guest_context, + vcpu->arch.isa); + + kvm_riscv_vcpu_aia_load(vcpu, cpu); + + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); vcpu->cpu = cpu; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + void *nsh; struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; vcpu->cpu = -1; + kvm_riscv_vcpu_aia_put(vcpu); + kvm_riscv_vcpu_guest_fp_save(&vcpu->arch.guest_context, vcpu->arch.isa); kvm_riscv_vcpu_host_fp_restore(&vcpu->arch.host_context); kvm_riscv_vcpu_timer_save(vcpu); - - csr->vsstatus = csr_read(CSR_VSSTATUS); - csr->vsie = csr_read(CSR_VSIE); - csr->vstvec = csr_read(CSR_VSTVEC); - csr->vsscratch = csr_read(CSR_VSSCRATCH); - csr->vsepc = csr_read(CSR_VSEPC); - csr->vscause = csr_read(CSR_VSCAUSE); - csr->vstval = csr_read(CSR_VSTVAL); - csr->hvip = csr_read(CSR_HVIP); - csr->vsatp = csr_read(CSR_VSATP); + kvm_riscv_vcpu_guest_vector_save(&vcpu->arch.guest_context, + vcpu->arch.isa); + kvm_riscv_vcpu_host_vector_restore(&vcpu->arch.host_context); + + if (kvm_riscv_nacl_available()) { + nsh = nacl_shmem(); + csr->vsstatus = nacl_csr_read(nsh, CSR_VSSTATUS); + csr->vsie = nacl_csr_read(nsh, CSR_VSIE); + csr->vstvec = nacl_csr_read(nsh, CSR_VSTVEC); + csr->vsscratch = nacl_csr_read(nsh, CSR_VSSCRATCH); + csr->vsepc = nacl_csr_read(nsh, CSR_VSEPC); + csr->vscause = nacl_csr_read(nsh, CSR_VSCAUSE); + csr->vstval = nacl_csr_read(nsh, CSR_VSTVAL); + csr->hvip = nacl_csr_read(nsh, CSR_HVIP); + csr->vsatp = nacl_csr_read(nsh, CSR_VSATP); + } else { + csr->vsstatus = csr_read(CSR_VSSTATUS); + csr->vsie = csr_read(CSR_VSIE); + csr->vstvec = csr_read(CSR_VSTVEC); + csr->vsscratch = csr_read(CSR_VSSCRATCH); + csr->vsepc = csr_read(CSR_VSEPC); + csr->vscause = csr_read(CSR_VSCAUSE); + csr->vstval = csr_read(CSR_VSTVAL); + csr->hvip = csr_read(CSR_HVIP); + csr->vsatp = csr_read(CSR_VSATP); + } } -static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) +/** + * kvm_riscv_check_vcpu_requests - check and handle pending vCPU requests + * @vcpu: the VCPU pointer + * + * Return: 1 if we should enter the guest + * 0 if we should exit to userspace + */ +static int kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) { struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); @@ -922,11 +697,11 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) { kvm_vcpu_srcu_read_unlock(vcpu); rcuwait_wait_event(wait, - (!vcpu->arch.power_off) && (!vcpu->arch.pause), + (!kvm_riscv_vcpu_stopped(vcpu)) && (!vcpu->arch.pause), TASK_INTERRUPTIBLE); kvm_vcpu_srcu_read_lock(vcpu); - if (vcpu->arch.power_off || vcpu->arch.pause) { + if (kvm_riscv_vcpu_stopped(vcpu) || vcpu->arch.pause) { /* * Awaken to handle a signal, request to * sleep again later. @@ -936,34 +711,67 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) - kvm_riscv_reset_vcpu(vcpu); + kvm_riscv_reset_vcpu(vcpu, true); if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu)) - kvm_riscv_gstage_update_hgatp(vcpu); + kvm_riscv_mmu_update_hgatp(vcpu); if (kvm_check_request(KVM_REQ_FENCE_I, vcpu)) kvm_riscv_fence_i_process(vcpu); - /* - * The generic KVM_REQ_TLB_FLUSH is same as - * KVM_REQ_HFENCE_GVMA_VMID_ALL - */ - if (kvm_check_request(KVM_REQ_HFENCE_GVMA_VMID_ALL, vcpu)) - kvm_riscv_hfence_gvma_vmid_all_process(vcpu); + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvm_riscv_tlb_flush_process(vcpu); if (kvm_check_request(KVM_REQ_HFENCE_VVMA_ALL, vcpu)) kvm_riscv_hfence_vvma_all_process(vcpu); if (kvm_check_request(KVM_REQ_HFENCE, vcpu)) kvm_riscv_hfence_process(vcpu); + + if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) + kvm_riscv_vcpu_record_steal_time(vcpu); + + if (kvm_dirty_ring_check_request(vcpu)) + return 0; } + + return 1; } static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu) { struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; - csr_write(CSR_HVIP, csr->hvip); + ncsr_write(CSR_HVIP, csr->hvip); + kvm_riscv_vcpu_aia_update_hvip(vcpu); +} + +static __always_inline void kvm_riscv_vcpu_swap_in_guest_state(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_smstateen_csr *smcsr = &vcpu->arch.smstateen_csr; + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + vcpu->arch.host_scounteren = csr_swap(CSR_SCOUNTEREN, csr->scounteren); + vcpu->arch.host_senvcfg = csr_swap(CSR_SENVCFG, csr->senvcfg); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN) && + (cfg->hstateen0 & SMSTATEEN0_SSTATEEN0)) + vcpu->arch.host_sstateen0 = csr_swap(CSR_SSTATEEN0, + smcsr->sstateen0); +} + +static __always_inline void kvm_riscv_vcpu_swap_in_host_state(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_smstateen_csr *smcsr = &vcpu->arch.smstateen_csr; + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + csr->scounteren = csr_swap(CSR_SCOUNTEREN, vcpu->arch.host_scounteren); + csr->senvcfg = csr_swap(CSR_SENVCFG, vcpu->arch.host_senvcfg); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN) && + (cfg->hstateen0 & SMSTATEEN0_SSTATEEN0)) + smcsr->sstateen0 = csr_swap(CSR_SSTATEEN0, + vcpu->arch.host_sstateen0); } /* @@ -973,12 +781,84 @@ static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu) * This must be noinstr as instrumentation may make use of RCU, and this is not * safe during the EQS. */ -static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu) -{ +static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu, + struct kvm_cpu_trap *trap) +{ + void *nsh; + struct kvm_cpu_context *gcntx = &vcpu->arch.guest_context; + struct kvm_cpu_context *hcntx = &vcpu->arch.host_context; + + /* + * We save trap CSRs (such as SEPC, SCAUSE, STVAL, HTVAL, and + * HTINST) here because we do local_irq_enable() after this + * function in kvm_arch_vcpu_ioctl_run() which can result in + * an interrupt immediately after local_irq_enable() and can + * potentially change trap CSRs. + */ + + kvm_riscv_vcpu_swap_in_guest_state(vcpu); guest_state_enter_irqoff(); - __kvm_riscv_switch_to(&vcpu->arch); + + if (kvm_riscv_nacl_sync_sret_available()) { + nsh = nacl_shmem(); + + if (kvm_riscv_nacl_autoswap_csr_available()) { + hcntx->hstatus = + nacl_csr_read(nsh, CSR_HSTATUS); + nacl_scratch_write_long(nsh, + SBI_NACL_SHMEM_AUTOSWAP_OFFSET + + SBI_NACL_SHMEM_AUTOSWAP_HSTATUS, + gcntx->hstatus); + nacl_scratch_write_long(nsh, + SBI_NACL_SHMEM_AUTOSWAP_OFFSET, + SBI_NACL_SHMEM_AUTOSWAP_FLAG_HSTATUS); + } else if (kvm_riscv_nacl_sync_csr_available()) { + hcntx->hstatus = nacl_csr_swap(nsh, + CSR_HSTATUS, gcntx->hstatus); + } else { + hcntx->hstatus = csr_swap(CSR_HSTATUS, gcntx->hstatus); + } + + nacl_scratch_write_longs(nsh, + SBI_NACL_SHMEM_SRET_OFFSET + + SBI_NACL_SHMEM_SRET_X(1), + &gcntx->ra, + SBI_NACL_SHMEM_SRET_X_LAST); + + __kvm_riscv_nacl_switch_to(&vcpu->arch, SBI_EXT_NACL, + SBI_EXT_NACL_SYNC_SRET); + + if (kvm_riscv_nacl_autoswap_csr_available()) { + nacl_scratch_write_long(nsh, + SBI_NACL_SHMEM_AUTOSWAP_OFFSET, + 0); + gcntx->hstatus = nacl_scratch_read_long(nsh, + SBI_NACL_SHMEM_AUTOSWAP_OFFSET + + SBI_NACL_SHMEM_AUTOSWAP_HSTATUS); + } else { + gcntx->hstatus = csr_swap(CSR_HSTATUS, hcntx->hstatus); + } + + trap->htval = nacl_csr_read(nsh, CSR_HTVAL); + trap->htinst = nacl_csr_read(nsh, CSR_HTINST); + } else { + hcntx->hstatus = csr_swap(CSR_HSTATUS, gcntx->hstatus); + + __kvm_riscv_switch_to(&vcpu->arch); + + gcntx->hstatus = csr_swap(CSR_HSTATUS, hcntx->hstatus); + + trap->htval = csr_read(CSR_HTVAL); + trap->htinst = csr_read(CSR_HTINST); + } + + trap->sepc = gcntx->sepc; + trap->scause = csr_read(CSR_SCAUSE); + trap->stval = csr_read(CSR_STVAL); + vcpu->arch.last_exit_cpu = vcpu->cpu; guest_state_exit_irqoff(); + kvm_riscv_vcpu_swap_in_host_state(vcpu); } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) @@ -987,6 +867,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) struct kvm_cpu_trap trap; struct kvm_run *run = vcpu->run; + if (!vcpu->arch.ran_atleast_once) + kvm_riscv_vcpu_setup_config(vcpu); + /* Mark this VCPU ran at least once */ vcpu->arch.ran_atleast_once = true; @@ -1014,7 +897,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) return ret; } - if (run->immediate_exit) { + if (!vcpu->wants_to_run) { kvm_vcpu_srcu_read_unlock(vcpu); return -EINTR; } @@ -1027,14 +910,25 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_UNKNOWN; while (ret > 0) { /* Check conditions before entering the guest */ - ret = xfer_to_guest_mode_handle_work(vcpu); + ret = kvm_xfer_to_guest_mode_handle_work(vcpu); if (ret) continue; ret = 1; kvm_riscv_gstage_vmid_update(vcpu); - kvm_riscv_check_vcpu_requests(vcpu); + ret = kvm_riscv_check_vcpu_requests(vcpu); + if (ret <= 0) + continue; + + preempt_disable(); + + /* Update AIA HW state before entering guest */ + ret = kvm_riscv_vcpu_aia_update(vcpu); + if (ret <= 0) { + preempt_enable(); + continue; + } local_irq_disable(); @@ -1058,47 +952,36 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Update HVIP CSR for current CPU */ kvm_riscv_update_hvip(vcpu); - if (ret <= 0 || - kvm_riscv_gstage_vmid_ver_changed(&vcpu->kvm->arch.vmid) || + if (kvm_riscv_gstage_vmid_ver_changed(&vcpu->kvm->arch.vmid) || kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) { vcpu->mode = OUTSIDE_GUEST_MODE; local_irq_enable(); + preempt_enable(); kvm_vcpu_srcu_read_lock(vcpu); continue; } /* - * Cleanup stale TLB enteries + * Sanitize VMID mappings cached (TLB) on current CPU * * Note: This should be done after G-stage VMID has been * updated using kvm_riscv_gstage_vmid_ver_changed() */ kvm_riscv_local_tlb_sanitize(vcpu); + trace_kvm_entry(vcpu); + guest_timing_enter_irqoff(); - kvm_riscv_vcpu_enter_exit(vcpu); + kvm_riscv_vcpu_enter_exit(vcpu, &trap); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; - /* - * Save SCAUSE, STVAL, HTVAL, and HTINST because we might - * get an interrupt between __kvm_riscv_switch_to() and - * local_irq_enable() which can potentially change CSRs. - */ - trap.sepc = vcpu->arch.guest_context.sepc; - trap.scause = csr_read(CSR_SCAUSE); - trap.stval = csr_read(CSR_STVAL); - trap.htval = csr_read(CSR_HTVAL); - trap.htinst = csr_read(CSR_HTINST); - /* Syncup interrupts state with HW */ kvm_riscv_vcpu_sync_interrupts(vcpu); - preempt_disable(); - /* * We must ensure that any pending interrupts are taken before * we exit guest timing so that timer ticks are accounted as @@ -1116,6 +999,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) local_irq_enable(); + trace_kvm_exit(&trap); + preempt_enable(); kvm_vcpu_srcu_read_lock(vcpu); diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index c9f741ab26f5..0bb0c51e3c89 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -9,10 +9,13 @@ #include <linux/kvm_host.h> #include <asm/csr.h> #include <asm/insn-def.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nacl.h> static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_trap *trap) { + struct kvm_gstage_mapping host_map; struct kvm_memory_slot *memslot; unsigned long hva, fault_addr; bool writable; @@ -40,8 +43,9 @@ static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, }; } - ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva, - (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false); + ret = kvm_riscv_mmu_map(vcpu, memslot, fault_addr, hva, + (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false, + &host_map); if (ret < 0) return ret; @@ -135,7 +139,7 @@ unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu, void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu, struct kvm_cpu_trap *trap) { - unsigned long vsstatus = csr_read(CSR_VSSTATUS); + unsigned long vsstatus = ncsr_read(CSR_VSSTATUS); /* Change Guest SSTATUS.SPP bit */ vsstatus &= ~SR_SPP; @@ -151,15 +155,29 @@ void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu, vsstatus &= ~SR_SIE; /* Update Guest SSTATUS */ - csr_write(CSR_VSSTATUS, vsstatus); + ncsr_write(CSR_VSSTATUS, vsstatus); /* Update Guest SCAUSE, STVAL, and SEPC */ - csr_write(CSR_VSCAUSE, trap->scause); - csr_write(CSR_VSTVAL, trap->stval); - csr_write(CSR_VSEPC, trap->sepc); + ncsr_write(CSR_VSCAUSE, trap->scause); + ncsr_write(CSR_VSTVAL, trap->stval); + ncsr_write(CSR_VSEPC, trap->sepc); /* Set Guest PC to Guest exception vector */ - vcpu->arch.guest_context.sepc = csr_read(CSR_VSTVEC); + vcpu->arch.guest_context.sepc = ncsr_read(CSR_VSTVEC); + + /* Set Guest privilege mode to supervisor */ + vcpu->arch.guest_context.sstatus |= SR_SPP; +} + +static inline int vcpu_redirect(struct kvm_vcpu *vcpu, struct kvm_cpu_trap *trap) +{ + int ret = -EFAULT; + + if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) { + kvm_riscv_vcpu_trap_redirect(vcpu, trap); + ret = 1; + } + return ret; } /* @@ -179,6 +197,34 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, ret = -EFAULT; run->exit_reason = KVM_EXIT_UNKNOWN; switch (trap->scause) { + case EXC_INST_ILLEGAL: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_ILLEGAL_INSN); + vcpu->stat.instr_illegal_exits++; + ret = vcpu_redirect(vcpu, trap); + break; + case EXC_LOAD_MISALIGNED: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_MISALIGNED_LOAD); + vcpu->stat.load_misaligned_exits++; + ret = vcpu_redirect(vcpu, trap); + break; + case EXC_STORE_MISALIGNED: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_MISALIGNED_STORE); + vcpu->stat.store_misaligned_exits++; + ret = vcpu_redirect(vcpu, trap); + break; + case EXC_LOAD_ACCESS: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_ACCESS_LOAD); + vcpu->stat.load_access_exits++; + ret = vcpu_redirect(vcpu, trap); + break; + case EXC_STORE_ACCESS: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_ACCESS_STORE); + vcpu->stat.store_access_exits++; + ret = vcpu_redirect(vcpu, trap); + break; + case EXC_INST_ACCESS: + ret = vcpu_redirect(vcpu, trap); + break; case EXC_VIRTUAL_INST_FAULT: if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) ret = kvm_riscv_vcpu_virtual_insn(vcpu, run, trap); @@ -193,6 +239,10 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) ret = kvm_riscv_vcpu_sbi_ecall(vcpu, run); break; + case EXC_BREAKPOINT: + run->exit_reason = KVM_EXIT_DEBUG; + ret = 0; + break; default: break; } diff --git a/arch/riscv/kvm/vcpu_fp.c b/arch/riscv/kvm/vcpu_fp.c index 9d8cbc42057a..030904d82b58 100644 --- a/arch/riscv/kvm/vcpu_fp.c +++ b/arch/riscv/kvm/vcpu_fp.c @@ -11,7 +11,7 @@ #include <linux/err.h> #include <linux/kvm_host.h> #include <linux/uaccess.h> -#include <asm/hwcap.h> +#include <asm/cpufeature.h> #ifdef CONFIG_FPU void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu) @@ -96,7 +96,7 @@ int kvm_riscv_vcpu_get_reg_fp(struct kvm_vcpu *vcpu, reg_num <= KVM_REG_RISCV_FP_F_REG(f[31])) reg_val = &cntx->fp.f.f[reg_num]; else - return -EINVAL; + return -ENOENT; } else if ((rtype == KVM_REG_RISCV_FP_D) && riscv_isa_extension_available(vcpu->arch.isa, d)) { if (reg_num == KVM_REG_RISCV_FP_D_REG(fcsr)) { @@ -109,9 +109,9 @@ int kvm_riscv_vcpu_get_reg_fp(struct kvm_vcpu *vcpu, return -EINVAL; reg_val = &cntx->fp.d.f[reg_num]; } else - return -EINVAL; + return -ENOENT; } else - return -EINVAL; + return -ENOENT; if (copy_to_user(uaddr, reg_val, KVM_REG_SIZE(reg->id))) return -EFAULT; @@ -141,7 +141,7 @@ int kvm_riscv_vcpu_set_reg_fp(struct kvm_vcpu *vcpu, reg_num <= KVM_REG_RISCV_FP_F_REG(f[31])) reg_val = &cntx->fp.f.f[reg_num]; else - return -EINVAL; + return -ENOENT; } else if ((rtype == KVM_REG_RISCV_FP_D) && riscv_isa_extension_available(vcpu->arch.isa, d)) { if (reg_num == KVM_REG_RISCV_FP_D_REG(fcsr)) { @@ -154,9 +154,9 @@ int kvm_riscv_vcpu_set_reg_fp(struct kvm_vcpu *vcpu, return -EINVAL; reg_val = &cntx->fp.d.f[reg_num]; } else - return -EINVAL; + return -ENOENT; } else - return -EINVAL; + return -ENOENT; if (copy_from_user(reg_val, uaddr, KVM_REG_SIZE(reg->id))) return -EFAULT; diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c index 0bb52761a3f7..4d89b94128ae 100644 --- a/arch/riscv/kvm/vcpu_insn.c +++ b/arch/riscv/kvm/vcpu_insn.c @@ -7,129 +7,8 @@ #include <linux/bitops.h> #include <linux/kvm_host.h> -#define INSN_OPCODE_MASK 0x007c -#define INSN_OPCODE_SHIFT 2 -#define INSN_OPCODE_SYSTEM 28 - -#define INSN_MASK_WFI 0xffffffff -#define INSN_MATCH_WFI 0x10500073 - -#define INSN_MATCH_CSRRW 0x1073 -#define INSN_MASK_CSRRW 0x707f -#define INSN_MATCH_CSRRS 0x2073 -#define INSN_MASK_CSRRS 0x707f -#define INSN_MATCH_CSRRC 0x3073 -#define INSN_MASK_CSRRC 0x707f -#define INSN_MATCH_CSRRWI 0x5073 -#define INSN_MASK_CSRRWI 0x707f -#define INSN_MATCH_CSRRSI 0x6073 -#define INSN_MASK_CSRRSI 0x707f -#define INSN_MATCH_CSRRCI 0x7073 -#define INSN_MASK_CSRRCI 0x707f - -#define INSN_MATCH_LB 0x3 -#define INSN_MASK_LB 0x707f -#define INSN_MATCH_LH 0x1003 -#define INSN_MASK_LH 0x707f -#define INSN_MATCH_LW 0x2003 -#define INSN_MASK_LW 0x707f -#define INSN_MATCH_LD 0x3003 -#define INSN_MASK_LD 0x707f -#define INSN_MATCH_LBU 0x4003 -#define INSN_MASK_LBU 0x707f -#define INSN_MATCH_LHU 0x5003 -#define INSN_MASK_LHU 0x707f -#define INSN_MATCH_LWU 0x6003 -#define INSN_MASK_LWU 0x707f -#define INSN_MATCH_SB 0x23 -#define INSN_MASK_SB 0x707f -#define INSN_MATCH_SH 0x1023 -#define INSN_MASK_SH 0x707f -#define INSN_MATCH_SW 0x2023 -#define INSN_MASK_SW 0x707f -#define INSN_MATCH_SD 0x3023 -#define INSN_MASK_SD 0x707f - -#define INSN_MATCH_C_LD 0x6000 -#define INSN_MASK_C_LD 0xe003 -#define INSN_MATCH_C_SD 0xe000 -#define INSN_MASK_C_SD 0xe003 -#define INSN_MATCH_C_LW 0x4000 -#define INSN_MASK_C_LW 0xe003 -#define INSN_MATCH_C_SW 0xc000 -#define INSN_MASK_C_SW 0xe003 -#define INSN_MATCH_C_LDSP 0x6002 -#define INSN_MASK_C_LDSP 0xe003 -#define INSN_MATCH_C_SDSP 0xe002 -#define INSN_MASK_C_SDSP 0xe003 -#define INSN_MATCH_C_LWSP 0x4002 -#define INSN_MASK_C_LWSP 0xe003 -#define INSN_MATCH_C_SWSP 0xc002 -#define INSN_MASK_C_SWSP 0xe003 - -#define INSN_16BIT_MASK 0x3 - -#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK) - -#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 2 : 4) - -#ifdef CONFIG_64BIT -#define LOG_REGBYTES 3 -#else -#define LOG_REGBYTES 2 -#endif -#define REGBYTES (1 << LOG_REGBYTES) - -#define SH_RD 7 -#define SH_RS1 15 -#define SH_RS2 20 -#define SH_RS2C 2 -#define MASK_RX 0x1f - -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) -#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ - (RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 1) << 6)) -#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 2) << 6)) -#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 2) << 6)) -#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 3) << 6)) -#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ - (RV_X(x, 7, 2) << 6)) -#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 7, 3) << 6)) -#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) -#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) -#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) - -#define SHIFT_RIGHT(x, y) \ - ((y) < 0 ? ((x) << -(y)) : ((x) >> (y))) - -#define REG_MASK \ - ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) - -#define REG_OFFSET(insn, pos) \ - (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) - -#define REG_PTR(insn, pos, regs) \ - ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))) - -#define GET_FUNCT3(insn) (((insn) >> 12) & 7) - -#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) -#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) -#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) -#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) -#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) -#define GET_SP(regs) (*REG_PTR(2, 0, regs)) -#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) -#define IMM_I(insn) ((s32)(insn) >> 20) -#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ - (s32)(((insn) >> 7) & 0x1f)) +#include <asm/cpufeature.h> +#include <asm/insn.h> struct insn_func { unsigned long mask; @@ -201,6 +80,13 @@ static int wfi_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn) return KVM_INSN_CONTINUE_NEXT_SEPC; } +static int wrs_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn) +{ + vcpu->stat.wrs_exit_stat++; + kvm_vcpu_on_spin(vcpu, vcpu->arch.guest_context.sstatus & SR_SPP); + return KVM_INSN_CONTINUE_NEXT_SEPC; +} + struct csr_func { unsigned int base; unsigned int count; @@ -213,7 +99,21 @@ struct csr_func { unsigned long wr_mask); }; -static const struct csr_func csr_funcs[] = { }; +static int seed_csr_rmw(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + if (!riscv_isa_extension_available(vcpu->arch.isa, ZKR)) + return KVM_INSN_ILLEGAL_TRAP; + + return KVM_INSN_EXIT_TO_USER_SPACE; +} + +static const struct csr_func csr_funcs[] = { + KVM_RISCV_VCPU_AIA_CSR_FUNCS + KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS + { .base = CSR_SEED, .count = 1, .func = seed_csr_rmw }, +}; /** * kvm_riscv_vcpu_csr_return -- Handle CSR read/write after user space @@ -362,6 +262,11 @@ static const struct insn_func system_opcode_funcs[] = { .match = INSN_MATCH_WFI, .func = wfi_insn, }, + { + .mask = INSN_MASK_WRS, + .match = INSN_MATCH_WRS, + .func = wrs_insn, + }, }; static int system_opcode_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, @@ -393,6 +298,22 @@ static int system_opcode_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, return (rc <= 0) ? rc : 1; } +static bool is_load_guest_page_fault(unsigned long scause) +{ + /** + * If a g-stage page fault occurs, the direct approach + * is to let the g-stage page fault handler handle it + * naturally, however, calling the g-stage page fault + * handler here seems rather strange. + * Considering this is a corner case, we can directly + * return to the guest and re-execute the same PC, this + * will trigger a g-stage page fault again and then the + * regular g-stage page fault handler will populate + * g-stage page table. + */ + return (scause == EXC_LOAD_GUEST_PAGE_FAULT); +} + /** * kvm_riscv_vcpu_virtual_insn -- Handle virtual instruction trap * @@ -418,6 +339,8 @@ int kvm_riscv_vcpu_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ct->sepc, &utrap); if (utrap.scause) { + if (is_load_guest_page_fault(utrap.scause)) + return 1; utrap.sepc = ct->sepc; kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); return 1; @@ -473,6 +396,8 @@ int kvm_riscv_vcpu_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run, insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, &utrap); if (utrap.scause) { + if (is_load_guest_page_fault(utrap.scause)) + return 1; /* Redirect trap if we failed to read instruction */ utrap.sepc = ct->sepc; kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); @@ -599,6 +524,8 @@ int kvm_riscv_vcpu_mmio_store(struct kvm_vcpu *vcpu, struct kvm_run *run, insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, &utrap); if (utrap.scause) { + if (is_load_guest_page_fault(utrap.scause)) + return 1; /* Redirect trap if we failed to read instruction */ utrap.sepc = ct->sepc; kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c new file mode 100644 index 000000000000..865dae903aa0 --- /dev/null +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -0,0 +1,1292 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * Copyright (C) 2023 Ventana Micro Systems Inc. + * + * Authors: + * Anup Patel <apatel@ventanamicro.com> + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/uaccess.h> +#include <linux/kvm_host.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/kvm_vcpu_vector.h> +#include <asm/pgtable.h> +#include <asm/vector.h> + +#define KVM_RISCV_BASE_ISA_MASK GENMASK(25, 0) + +#define KVM_ISA_EXT_ARR(ext) \ +[KVM_RISCV_ISA_EXT_##ext] = RISCV_ISA_EXT_##ext + +/* Mapping between KVM ISA Extension ID & guest ISA extension ID */ +static const unsigned long kvm_isa_ext_arr[] = { + /* Single letter extensions (alphabetically sorted) */ + [KVM_RISCV_ISA_EXT_A] = RISCV_ISA_EXT_a, + [KVM_RISCV_ISA_EXT_C] = RISCV_ISA_EXT_c, + [KVM_RISCV_ISA_EXT_D] = RISCV_ISA_EXT_d, + [KVM_RISCV_ISA_EXT_F] = RISCV_ISA_EXT_f, + [KVM_RISCV_ISA_EXT_H] = RISCV_ISA_EXT_h, + [KVM_RISCV_ISA_EXT_I] = RISCV_ISA_EXT_i, + [KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m, + [KVM_RISCV_ISA_EXT_V] = RISCV_ISA_EXT_v, + /* Multi letter extensions (alphabetically sorted) */ + KVM_ISA_EXT_ARR(SMNPM), + KVM_ISA_EXT_ARR(SMSTATEEN), + KVM_ISA_EXT_ARR(SSAIA), + KVM_ISA_EXT_ARR(SSCOFPMF), + KVM_ISA_EXT_ARR(SSNPM), + KVM_ISA_EXT_ARR(SSTC), + KVM_ISA_EXT_ARR(SVADE), + KVM_ISA_EXT_ARR(SVADU), + KVM_ISA_EXT_ARR(SVINVAL), + KVM_ISA_EXT_ARR(SVNAPOT), + KVM_ISA_EXT_ARR(SVPBMT), + KVM_ISA_EXT_ARR(SVVPTC), + KVM_ISA_EXT_ARR(ZAAMO), + KVM_ISA_EXT_ARR(ZABHA), + KVM_ISA_EXT_ARR(ZACAS), + KVM_ISA_EXT_ARR(ZALRSC), + KVM_ISA_EXT_ARR(ZAWRS), + KVM_ISA_EXT_ARR(ZBA), + KVM_ISA_EXT_ARR(ZBB), + KVM_ISA_EXT_ARR(ZBC), + KVM_ISA_EXT_ARR(ZBKB), + KVM_ISA_EXT_ARR(ZBKC), + KVM_ISA_EXT_ARR(ZBKX), + KVM_ISA_EXT_ARR(ZBS), + KVM_ISA_EXT_ARR(ZCA), + KVM_ISA_EXT_ARR(ZCB), + KVM_ISA_EXT_ARR(ZCD), + KVM_ISA_EXT_ARR(ZCF), + KVM_ISA_EXT_ARR(ZCMOP), + KVM_ISA_EXT_ARR(ZFA), + KVM_ISA_EXT_ARR(ZFBFMIN), + KVM_ISA_EXT_ARR(ZFH), + KVM_ISA_EXT_ARR(ZFHMIN), + KVM_ISA_EXT_ARR(ZICBOM), + KVM_ISA_EXT_ARR(ZICBOP), + KVM_ISA_EXT_ARR(ZICBOZ), + KVM_ISA_EXT_ARR(ZICCRSE), + KVM_ISA_EXT_ARR(ZICNTR), + KVM_ISA_EXT_ARR(ZICOND), + KVM_ISA_EXT_ARR(ZICSR), + KVM_ISA_EXT_ARR(ZIFENCEI), + KVM_ISA_EXT_ARR(ZIHINTNTL), + KVM_ISA_EXT_ARR(ZIHINTPAUSE), + KVM_ISA_EXT_ARR(ZIHPM), + KVM_ISA_EXT_ARR(ZIMOP), + KVM_ISA_EXT_ARR(ZKND), + KVM_ISA_EXT_ARR(ZKNE), + KVM_ISA_EXT_ARR(ZKNH), + KVM_ISA_EXT_ARR(ZKR), + KVM_ISA_EXT_ARR(ZKSED), + KVM_ISA_EXT_ARR(ZKSH), + KVM_ISA_EXT_ARR(ZKT), + KVM_ISA_EXT_ARR(ZTSO), + KVM_ISA_EXT_ARR(ZVBB), + KVM_ISA_EXT_ARR(ZVBC), + KVM_ISA_EXT_ARR(ZVFBFMIN), + KVM_ISA_EXT_ARR(ZVFBFWMA), + KVM_ISA_EXT_ARR(ZVFH), + KVM_ISA_EXT_ARR(ZVFHMIN), + KVM_ISA_EXT_ARR(ZVKB), + KVM_ISA_EXT_ARR(ZVKG), + KVM_ISA_EXT_ARR(ZVKNED), + KVM_ISA_EXT_ARR(ZVKNHA), + KVM_ISA_EXT_ARR(ZVKNHB), + KVM_ISA_EXT_ARR(ZVKSED), + KVM_ISA_EXT_ARR(ZVKSH), + KVM_ISA_EXT_ARR(ZVKT), +}; + +static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) +{ + unsigned long i; + + for (i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) { + if (kvm_isa_ext_arr[i] == base_ext) + return i; + } + + return KVM_RISCV_ISA_EXT_MAX; +} + +static int kvm_riscv_vcpu_isa_check_host(unsigned long kvm_ext, unsigned long *guest_ext) +{ + unsigned long host_ext; + + if (kvm_ext >= KVM_RISCV_ISA_EXT_MAX || + kvm_ext >= ARRAY_SIZE(kvm_isa_ext_arr)) + return -ENOENT; + + *guest_ext = kvm_isa_ext_arr[kvm_ext]; + switch (*guest_ext) { + case RISCV_ISA_EXT_SMNPM: + /* + * Pointer masking effective in (H)S-mode is provided by the + * Smnpm extension, so that extension is reported to the guest, + * even though the CSR bits for configuring VS-mode pointer + * masking on the host side are part of the Ssnpm extension. + */ + host_ext = RISCV_ISA_EXT_SSNPM; + break; + default: + host_ext = *guest_ext; + break; + } + + if (!__riscv_isa_extension_available(NULL, host_ext)) + return -ENOENT; + + return 0; +} + +static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext) +{ + switch (ext) { + case KVM_RISCV_ISA_EXT_H: + return false; + case KVM_RISCV_ISA_EXT_SSCOFPMF: + /* Sscofpmf depends on interrupt filtering defined in ssaia */ + return __riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSAIA); + case KVM_RISCV_ISA_EXT_SVADU: + /* + * The henvcfg.ADUE is read-only zero if menvcfg.ADUE is zero. + * Guest OS can use Svadu only when host OS enable Svadu. + */ + return arch_has_hw_pte_young(); + case KVM_RISCV_ISA_EXT_V: + return riscv_v_vstate_ctrl_user_allowed(); + default: + break; + } + + return true; +} + +static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) +{ + switch (ext) { + /* Extensions which don't have any mechanism to disable */ + case KVM_RISCV_ISA_EXT_A: + case KVM_RISCV_ISA_EXT_C: + case KVM_RISCV_ISA_EXT_I: + case KVM_RISCV_ISA_EXT_M: + /* There is not architectural config bit to disable sscofpmf completely */ + case KVM_RISCV_ISA_EXT_SSCOFPMF: + case KVM_RISCV_ISA_EXT_SSNPM: + case KVM_RISCV_ISA_EXT_SSTC: + case KVM_RISCV_ISA_EXT_SVINVAL: + case KVM_RISCV_ISA_EXT_SVNAPOT: + case KVM_RISCV_ISA_EXT_SVVPTC: + case KVM_RISCV_ISA_EXT_ZAAMO: + case KVM_RISCV_ISA_EXT_ZABHA: + case KVM_RISCV_ISA_EXT_ZACAS: + case KVM_RISCV_ISA_EXT_ZALRSC: + case KVM_RISCV_ISA_EXT_ZAWRS: + case KVM_RISCV_ISA_EXT_ZBA: + case KVM_RISCV_ISA_EXT_ZBB: + case KVM_RISCV_ISA_EXT_ZBC: + case KVM_RISCV_ISA_EXT_ZBKB: + case KVM_RISCV_ISA_EXT_ZBKC: + case KVM_RISCV_ISA_EXT_ZBKX: + case KVM_RISCV_ISA_EXT_ZBS: + case KVM_RISCV_ISA_EXT_ZCA: + case KVM_RISCV_ISA_EXT_ZCB: + case KVM_RISCV_ISA_EXT_ZCD: + case KVM_RISCV_ISA_EXT_ZCF: + case KVM_RISCV_ISA_EXT_ZCMOP: + case KVM_RISCV_ISA_EXT_ZFA: + case KVM_RISCV_ISA_EXT_ZFBFMIN: + case KVM_RISCV_ISA_EXT_ZFH: + case KVM_RISCV_ISA_EXT_ZFHMIN: + case KVM_RISCV_ISA_EXT_ZICBOP: + case KVM_RISCV_ISA_EXT_ZICCRSE: + case KVM_RISCV_ISA_EXT_ZICNTR: + case KVM_RISCV_ISA_EXT_ZICOND: + case KVM_RISCV_ISA_EXT_ZICSR: + case KVM_RISCV_ISA_EXT_ZIFENCEI: + case KVM_RISCV_ISA_EXT_ZIHINTNTL: + case KVM_RISCV_ISA_EXT_ZIHINTPAUSE: + case KVM_RISCV_ISA_EXT_ZIHPM: + case KVM_RISCV_ISA_EXT_ZIMOP: + case KVM_RISCV_ISA_EXT_ZKND: + case KVM_RISCV_ISA_EXT_ZKNE: + case KVM_RISCV_ISA_EXT_ZKNH: + case KVM_RISCV_ISA_EXT_ZKR: + case KVM_RISCV_ISA_EXT_ZKSED: + case KVM_RISCV_ISA_EXT_ZKSH: + case KVM_RISCV_ISA_EXT_ZKT: + case KVM_RISCV_ISA_EXT_ZTSO: + case KVM_RISCV_ISA_EXT_ZVBB: + case KVM_RISCV_ISA_EXT_ZVBC: + case KVM_RISCV_ISA_EXT_ZVFBFMIN: + case KVM_RISCV_ISA_EXT_ZVFBFWMA: + case KVM_RISCV_ISA_EXT_ZVFH: + case KVM_RISCV_ISA_EXT_ZVFHMIN: + case KVM_RISCV_ISA_EXT_ZVKB: + case KVM_RISCV_ISA_EXT_ZVKG: + case KVM_RISCV_ISA_EXT_ZVKNED: + case KVM_RISCV_ISA_EXT_ZVKNHA: + case KVM_RISCV_ISA_EXT_ZVKNHB: + case KVM_RISCV_ISA_EXT_ZVKSED: + case KVM_RISCV_ISA_EXT_ZVKSH: + case KVM_RISCV_ISA_EXT_ZVKT: + return false; + /* Extensions which can be disabled using Smstateen */ + case KVM_RISCV_ISA_EXT_SSAIA: + return riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN); + case KVM_RISCV_ISA_EXT_SVADE: + /* + * The henvcfg.ADUE is read-only zero if menvcfg.ADUE is zero. + * Svade can't be disabled unless we support Svadu. + */ + return arch_has_hw_pte_young(); + default: + break; + } + + return true; +} + +void kvm_riscv_vcpu_setup_isa(struct kvm_vcpu *vcpu) +{ + unsigned long guest_ext, i; + + for (i = 0; i < ARRAY_SIZE(kvm_isa_ext_arr); i++) { + if (kvm_riscv_vcpu_isa_check_host(i, &guest_ext)) + continue; + if (kvm_riscv_vcpu_isa_enable_allowed(i)) + set_bit(guest_ext, vcpu->arch.isa); + } +} + +static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CONFIG); + unsigned long reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + switch (reg_num) { + case KVM_REG_RISCV_CONFIG_REG(isa): + reg_val = vcpu->arch.isa[0] & KVM_RISCV_BASE_ISA_MASK; + break; + case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): + if (!riscv_isa_extension_available(NULL, ZICBOM)) + return -ENOENT; + reg_val = riscv_cbom_block_size; + break; + case KVM_REG_RISCV_CONFIG_REG(zicboz_block_size): + if (!riscv_isa_extension_available(NULL, ZICBOZ)) + return -ENOENT; + reg_val = riscv_cboz_block_size; + break; + case KVM_REG_RISCV_CONFIG_REG(zicbop_block_size): + if (!riscv_isa_extension_available(NULL, ZICBOP)) + return -ENOENT; + reg_val = riscv_cbop_block_size; + break; + case KVM_REG_RISCV_CONFIG_REG(mvendorid): + reg_val = vcpu->arch.mvendorid; + break; + case KVM_REG_RISCV_CONFIG_REG(marchid): + reg_val = vcpu->arch.marchid; + break; + case KVM_REG_RISCV_CONFIG_REG(mimpid): + reg_val = vcpu->arch.mimpid; + break; + case KVM_REG_RISCV_CONFIG_REG(satp_mode): + reg_val = satp_mode >> SATP_MODE_SHIFT; + break; + default: + return -ENOENT; + } + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CONFIG); + unsigned long i, isa_ext, reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch (reg_num) { + case KVM_REG_RISCV_CONFIG_REG(isa): + /* + * This ONE REG interface is only defined for + * single letter extensions. + */ + if (fls(reg_val) >= RISCV_ISA_EXT_BASE) + return -EINVAL; + + /* + * Return early (i.e. do nothing) if reg_val is the same + * value retrievable via kvm_riscv_vcpu_get_reg_config(). + */ + if (reg_val == (vcpu->arch.isa[0] & KVM_RISCV_BASE_ISA_MASK)) + break; + + if (!vcpu->arch.ran_atleast_once) { + /* Ignore the enable/disable request for certain extensions */ + for (i = 0; i < RISCV_ISA_EXT_BASE; i++) { + isa_ext = kvm_riscv_vcpu_base2isa_ext(i); + if (isa_ext >= KVM_RISCV_ISA_EXT_MAX) { + reg_val &= ~BIT(i); + continue; + } + if (!kvm_riscv_vcpu_isa_enable_allowed(isa_ext)) + if (reg_val & BIT(i)) + reg_val &= ~BIT(i); + if (!kvm_riscv_vcpu_isa_disable_allowed(isa_ext)) + if (!(reg_val & BIT(i))) + reg_val |= BIT(i); + } + reg_val &= riscv_isa_extension_base(NULL); + /* Do not modify anything beyond single letter extensions */ + reg_val = (vcpu->arch.isa[0] & ~KVM_RISCV_BASE_ISA_MASK) | + (reg_val & KVM_RISCV_BASE_ISA_MASK); + vcpu->arch.isa[0] = reg_val; + kvm_riscv_vcpu_fp_reset(vcpu); + } else { + return -EBUSY; + } + break; + case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): + if (!riscv_isa_extension_available(NULL, ZICBOM)) + return -ENOENT; + if (reg_val != riscv_cbom_block_size) + return -EINVAL; + break; + case KVM_REG_RISCV_CONFIG_REG(zicboz_block_size): + if (!riscv_isa_extension_available(NULL, ZICBOZ)) + return -ENOENT; + if (reg_val != riscv_cboz_block_size) + return -EINVAL; + break; + case KVM_REG_RISCV_CONFIG_REG(zicbop_block_size): + if (!riscv_isa_extension_available(NULL, ZICBOP)) + return -ENOENT; + if (reg_val != riscv_cbop_block_size) + return -EINVAL; + break; + case KVM_REG_RISCV_CONFIG_REG(mvendorid): + if (reg_val == vcpu->arch.mvendorid) + break; + if (!vcpu->arch.ran_atleast_once) + vcpu->arch.mvendorid = reg_val; + else + return -EBUSY; + break; + case KVM_REG_RISCV_CONFIG_REG(marchid): + if (reg_val == vcpu->arch.marchid) + break; + if (!vcpu->arch.ran_atleast_once) + vcpu->arch.marchid = reg_val; + else + return -EBUSY; + break; + case KVM_REG_RISCV_CONFIG_REG(mimpid): + if (reg_val == vcpu->arch.mimpid) + break; + if (!vcpu->arch.ran_atleast_once) + vcpu->arch.mimpid = reg_val; + else + return -EBUSY; + break; + case KVM_REG_RISCV_CONFIG_REG(satp_mode): + if (reg_val != (satp_mode >> SATP_MODE_SHIFT)) + return -EINVAL; + break; + default: + return -ENOENT; + } + + return 0; +} + +static int kvm_riscv_vcpu_get_reg_core(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CORE); + unsigned long reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + if (reg_num >= sizeof(struct kvm_riscv_core) / sizeof(unsigned long)) + return -ENOENT; + + if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc)) + reg_val = cntx->sepc; + else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num && + reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6)) + reg_val = ((unsigned long *)cntx)[reg_num]; + else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) + reg_val = (cntx->sstatus & SR_SPP) ? + KVM_RISCV_MODE_S : KVM_RISCV_MODE_U; + else + return -ENOENT; + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_riscv_vcpu_set_reg_core(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CORE); + unsigned long reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + if (reg_num >= sizeof(struct kvm_riscv_core) / sizeof(unsigned long)) + return -ENOENT; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc)) + cntx->sepc = reg_val; + else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num && + reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6)) + ((unsigned long *)cntx)[reg_num] = reg_val; + else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) { + if (reg_val == KVM_RISCV_MODE_S) + cntx->sstatus |= SR_SPP; + else + cntx->sstatus &= ~SR_SPP; + } else + return -ENOENT; + + return 0; +} + +static int kvm_riscv_vcpu_general_get_csr(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long *out_val) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + + if (reg_num >= sizeof(struct kvm_riscv_csr) / sizeof(unsigned long)) + return -ENOENT; + + if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) { + kvm_riscv_vcpu_flush_interrupts(vcpu); + *out_val = (csr->hvip >> VSIP_TO_HVIP_SHIFT) & VSIP_VALID_MASK; + *out_val |= csr->hvip & ~IRQ_LOCAL_MASK; + } else + *out_val = ((unsigned long *)csr)[reg_num]; + + return 0; +} + +static int kvm_riscv_vcpu_general_set_csr(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long reg_val) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + + if (reg_num >= sizeof(struct kvm_riscv_csr) / sizeof(unsigned long)) + return -ENOENT; + + if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) { + reg_val &= VSIP_VALID_MASK; + reg_val <<= VSIP_TO_HVIP_SHIFT; + } + + ((unsigned long *)csr)[reg_num] = reg_val; + + if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) + WRITE_ONCE(vcpu->arch.irqs_pending_mask[0], 0); + + return 0; +} + +static inline int kvm_riscv_vcpu_smstateen_set_csr(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long reg_val) +{ + struct kvm_vcpu_smstateen_csr *csr = &vcpu->arch.smstateen_csr; + + if (reg_num >= sizeof(struct kvm_riscv_smstateen_csr) / + sizeof(unsigned long)) + return -EINVAL; + + ((unsigned long *)csr)[reg_num] = reg_val; + return 0; +} + +static int kvm_riscv_vcpu_smstateen_get_csr(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long *out_val) +{ + struct kvm_vcpu_smstateen_csr *csr = &vcpu->arch.smstateen_csr; + + if (reg_num >= sizeof(struct kvm_riscv_smstateen_csr) / + sizeof(unsigned long)) + return -EINVAL; + + *out_val = ((unsigned long *)csr)[reg_num]; + return 0; +} + +static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + int rc; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CSR); + unsigned long reg_val, reg_subtype; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + switch (reg_subtype) { + case KVM_REG_RISCV_CSR_GENERAL: + rc = kvm_riscv_vcpu_general_get_csr(vcpu, reg_num, ®_val); + break; + case KVM_REG_RISCV_CSR_AIA: + rc = kvm_riscv_vcpu_aia_get_csr(vcpu, reg_num, ®_val); + break; + case KVM_REG_RISCV_CSR_SMSTATEEN: + rc = -EINVAL; + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) + rc = kvm_riscv_vcpu_smstateen_get_csr(vcpu, reg_num, + ®_val); + break; + default: + rc = -ENOENT; + break; + } + if (rc) + return rc; + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + int rc; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CSR); + unsigned long reg_val, reg_subtype; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + switch (reg_subtype) { + case KVM_REG_RISCV_CSR_GENERAL: + rc = kvm_riscv_vcpu_general_set_csr(vcpu, reg_num, reg_val); + break; + case KVM_REG_RISCV_CSR_AIA: + rc = kvm_riscv_vcpu_aia_set_csr(vcpu, reg_num, reg_val); + break; + case KVM_REG_RISCV_CSR_SMSTATEEN: + rc = -EINVAL; + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) + rc = kvm_riscv_vcpu_smstateen_set_csr(vcpu, reg_num, + reg_val); + break; + default: + rc = -ENOENT; + break; + } + if (rc) + return rc; + + return 0; +} + +static int riscv_vcpu_get_isa_ext_single(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long *reg_val) +{ + unsigned long guest_ext; + int ret; + + ret = kvm_riscv_vcpu_isa_check_host(reg_num, &guest_ext); + if (ret) + return ret; + + *reg_val = 0; + if (__riscv_isa_extension_available(vcpu->arch.isa, guest_ext)) + *reg_val = 1; /* Mark the given extension as available */ + + return 0; +} + +static int riscv_vcpu_set_isa_ext_single(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long reg_val) +{ + unsigned long guest_ext; + int ret; + + ret = kvm_riscv_vcpu_isa_check_host(reg_num, &guest_ext); + if (ret) + return ret; + + if (reg_val == test_bit(guest_ext, vcpu->arch.isa)) + return 0; + + if (!vcpu->arch.ran_atleast_once) { + /* + * All multi-letter extension and a few single letter + * extension can be disabled + */ + if (reg_val == 1 && + kvm_riscv_vcpu_isa_enable_allowed(reg_num)) + set_bit(guest_ext, vcpu->arch.isa); + else if (!reg_val && + kvm_riscv_vcpu_isa_disable_allowed(reg_num)) + clear_bit(guest_ext, vcpu->arch.isa); + else + return -EINVAL; + kvm_riscv_vcpu_fp_reset(vcpu); + } else { + return -EBUSY; + } + + return 0; +} + +static int riscv_vcpu_get_isa_ext_multi(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long *reg_val) +{ + unsigned long i, ext_id, ext_val; + + if (reg_num > KVM_REG_RISCV_ISA_MULTI_REG_LAST) + return -ENOENT; + + for (i = 0; i < BITS_PER_LONG; i++) { + ext_id = i + reg_num * BITS_PER_LONG; + if (ext_id >= KVM_RISCV_ISA_EXT_MAX) + break; + + ext_val = 0; + riscv_vcpu_get_isa_ext_single(vcpu, ext_id, &ext_val); + if (ext_val) + *reg_val |= KVM_REG_RISCV_ISA_MULTI_MASK(ext_id); + } + + return 0; +} + +static int riscv_vcpu_set_isa_ext_multi(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long reg_val, bool enable) +{ + unsigned long i, ext_id; + + if (reg_num > KVM_REG_RISCV_ISA_MULTI_REG_LAST) + return -ENOENT; + + for_each_set_bit(i, ®_val, BITS_PER_LONG) { + ext_id = i + reg_num * BITS_PER_LONG; + if (ext_id >= KVM_RISCV_ISA_EXT_MAX) + break; + + riscv_vcpu_set_isa_ext_single(vcpu, ext_id, enable); + } + + return 0; +} + +static int kvm_riscv_vcpu_get_reg_isa_ext(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + int rc; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_ISA_EXT); + unsigned long reg_val, reg_subtype; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + reg_val = 0; + switch (reg_subtype) { + case KVM_REG_RISCV_ISA_SINGLE: + rc = riscv_vcpu_get_isa_ext_single(vcpu, reg_num, ®_val); + break; + case KVM_REG_RISCV_ISA_MULTI_EN: + case KVM_REG_RISCV_ISA_MULTI_DIS: + rc = riscv_vcpu_get_isa_ext_multi(vcpu, reg_num, ®_val); + if (!rc && reg_subtype == KVM_REG_RISCV_ISA_MULTI_DIS) + reg_val = ~reg_val; + break; + default: + rc = -ENOENT; + } + if (rc) + return rc; + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_ISA_EXT); + unsigned long reg_val, reg_subtype; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch (reg_subtype) { + case KVM_REG_RISCV_ISA_SINGLE: + return riscv_vcpu_set_isa_ext_single(vcpu, reg_num, reg_val); + case KVM_REG_RISCV_ISA_MULTI_EN: + return riscv_vcpu_set_isa_ext_multi(vcpu, reg_num, reg_val, true); + case KVM_REG_RISCV_ISA_MULTI_DIS: + return riscv_vcpu_set_isa_ext_multi(vcpu, reg_num, reg_val, false); + default: + return -ENOENT; + } + + return 0; +} + +static int copy_config_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + int n = 0; + + for (int i = 0; i < sizeof(struct kvm_riscv_config)/sizeof(unsigned long); + i++) { + u64 size; + u64 reg; + + /* + * Avoid reporting config reg if the corresponding extension + * was not available. + */ + if (i == KVM_REG_RISCV_CONFIG_REG(zicbom_block_size) && + !riscv_isa_extension_available(NULL, ZICBOM)) + continue; + else if (i == KVM_REG_RISCV_CONFIG_REG(zicboz_block_size) && + !riscv_isa_extension_available(NULL, ZICBOZ)) + continue; + else if (i == KVM_REG_RISCV_CONFIG_REG(zicbop_block_size) && + !riscv_isa_extension_available(NULL, ZICBOP)) + continue; + + size = IS_ENABLED(CONFIG_32BIT) ? KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CONFIG | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + + n++; + } + + return n; +} + +static unsigned long num_config_regs(const struct kvm_vcpu *vcpu) +{ + return copy_config_reg_indices(vcpu, NULL); +} + +static inline unsigned long num_core_regs(void) +{ + return sizeof(struct kvm_riscv_core) / sizeof(unsigned long); +} + +static int copy_core_reg_indices(u64 __user *uindices) +{ + int n = num_core_regs(); + + for (int i = 0; i < n; i++) { + u64 size = IS_ENABLED(CONFIG_32BIT) ? + KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CORE | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + return n; +} + +static inline unsigned long num_csr_regs(const struct kvm_vcpu *vcpu) +{ + unsigned long n = sizeof(struct kvm_riscv_csr) / sizeof(unsigned long); + + if (riscv_isa_extension_available(vcpu->arch.isa, SSAIA)) + n += sizeof(struct kvm_riscv_aia_csr) / sizeof(unsigned long); + if (riscv_isa_extension_available(vcpu->arch.isa, SMSTATEEN)) + n += sizeof(struct kvm_riscv_smstateen_csr) / sizeof(unsigned long); + + return n; +} + +static int copy_csr_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + int n1 = sizeof(struct kvm_riscv_csr) / sizeof(unsigned long); + int n2 = 0, n3 = 0; + + /* copy general csr regs */ + for (int i = 0; i < n1; i++) { + u64 size = IS_ENABLED(CONFIG_32BIT) ? + KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CSR | + KVM_REG_RISCV_CSR_GENERAL | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + /* copy AIA csr regs */ + if (riscv_isa_extension_available(vcpu->arch.isa, SSAIA)) { + n2 = sizeof(struct kvm_riscv_aia_csr) / sizeof(unsigned long); + + for (int i = 0; i < n2; i++) { + u64 size = IS_ENABLED(CONFIG_32BIT) ? + KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CSR | + KVM_REG_RISCV_CSR_AIA | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + } + + /* copy Smstateen csr regs */ + if (riscv_isa_extension_available(vcpu->arch.isa, SMSTATEEN)) { + n3 = sizeof(struct kvm_riscv_smstateen_csr) / sizeof(unsigned long); + + for (int i = 0; i < n3; i++) { + u64 size = IS_ENABLED(CONFIG_32BIT) ? + KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CSR | + KVM_REG_RISCV_CSR_SMSTATEEN | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + } + + return n1 + n2 + n3; +} + +static inline unsigned long num_timer_regs(void) +{ + return sizeof(struct kvm_riscv_timer) / sizeof(u64); +} + +static int copy_timer_reg_indices(u64 __user *uindices) +{ + int n = num_timer_regs(); + + for (int i = 0; i < n; i++) { + u64 reg = KVM_REG_RISCV | KVM_REG_SIZE_U64 | + KVM_REG_RISCV_TIMER | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + return n; +} + +static inline unsigned long num_fp_f_regs(const struct kvm_vcpu *vcpu) +{ + const struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + + if (riscv_isa_extension_available(vcpu->arch.isa, f)) + return sizeof(cntx->fp.f) / sizeof(u32); + else + return 0; +} + +static int copy_fp_f_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + int n = num_fp_f_regs(vcpu); + + for (int i = 0; i < n; i++) { + u64 reg = KVM_REG_RISCV | KVM_REG_SIZE_U32 | + KVM_REG_RISCV_FP_F | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + return n; +} + +static inline unsigned long num_fp_d_regs(const struct kvm_vcpu *vcpu) +{ + const struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + + if (riscv_isa_extension_available(vcpu->arch.isa, d)) + return sizeof(cntx->fp.d.f) / sizeof(u64) + 1; + else + return 0; +} + +static int copy_fp_d_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + int i; + int n = num_fp_d_regs(vcpu); + u64 reg; + + /* copy fp.d.f indices */ + for (i = 0; i < n-1; i++) { + reg = KVM_REG_RISCV | KVM_REG_SIZE_U64 | + KVM_REG_RISCV_FP_D | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + /* copy fp.d.fcsr indices */ + reg = KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_D | i; + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + + return n; +} + +static int copy_isa_ext_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + unsigned long guest_ext; + unsigned int n = 0; + + for (int i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) { + u64 size = IS_ENABLED(CONFIG_32BIT) ? + KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_ISA_EXT | i; + + if (kvm_riscv_vcpu_isa_check_host(i, &guest_ext)) + continue; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + + n++; + } + + return n; +} + +static inline unsigned long num_isa_ext_regs(const struct kvm_vcpu *vcpu) +{ + return copy_isa_ext_reg_indices(vcpu, NULL); +} + +static unsigned long num_sbi_ext_regs(struct kvm_vcpu *vcpu) +{ + return kvm_riscv_vcpu_reg_indices_sbi_ext(vcpu, NULL); +} + +static inline unsigned long num_sbi_regs(struct kvm_vcpu *vcpu) +{ + return kvm_riscv_vcpu_reg_indices_sbi(vcpu, NULL); +} + +static inline unsigned long num_vector_regs(const struct kvm_vcpu *vcpu) +{ + if (!riscv_isa_extension_available(vcpu->arch.isa, v)) + return 0; + + /* vstart, vl, vtype, vcsr, vlenb and 32 vector regs */ + return 37; +} + +static int copy_vector_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + const struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + int n = num_vector_regs(vcpu); + u64 reg, size; + int i; + + if (n == 0) + return 0; + + /* copy vstart, vl, vtype, vcsr and vlenb */ + size = IS_ENABLED(CONFIG_32BIT) ? KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + for (i = 0; i < 5; i++) { + reg = KVM_REG_RISCV | size | KVM_REG_RISCV_VECTOR | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + /* vector_regs have a variable 'vlenb' size */ + size = __builtin_ctzl(cntx->vector.vlenb); + size <<= KVM_REG_SIZE_SHIFT; + for (i = 0; i < 32; i++) { + reg = KVM_REG_RISCV | KVM_REG_RISCV_VECTOR | size | + KVM_REG_RISCV_VECTOR_REG(i); + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + return n; +} + +/* + * kvm_riscv_vcpu_num_regs - how many registers do we present via KVM_GET/SET_ONE_REG + * + * This is for all registers. + */ +unsigned long kvm_riscv_vcpu_num_regs(struct kvm_vcpu *vcpu) +{ + unsigned long res = 0; + + res += num_config_regs(vcpu); + res += num_core_regs(); + res += num_csr_regs(vcpu); + res += num_timer_regs(); + res += num_fp_f_regs(vcpu); + res += num_fp_d_regs(vcpu); + res += num_vector_regs(vcpu); + res += num_isa_ext_regs(vcpu); + res += num_sbi_ext_regs(vcpu); + res += num_sbi_regs(vcpu); + + return res; +} + +/* + * kvm_riscv_vcpu_copy_reg_indices - get indices of all registers. + */ +int kvm_riscv_vcpu_copy_reg_indices(struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + int ret; + + ret = copy_config_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = copy_core_reg_indices(uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = copy_csr_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = copy_timer_reg_indices(uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = copy_fp_f_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = copy_fp_d_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = copy_vector_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = copy_isa_ext_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = kvm_riscv_vcpu_reg_indices_sbi_ext(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = kvm_riscv_vcpu_reg_indices_sbi(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + return 0; +} + +int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + switch (reg->id & KVM_REG_RISCV_TYPE_MASK) { + case KVM_REG_RISCV_CONFIG: + return kvm_riscv_vcpu_set_reg_config(vcpu, reg); + case KVM_REG_RISCV_CORE: + return kvm_riscv_vcpu_set_reg_core(vcpu, reg); + case KVM_REG_RISCV_CSR: + return kvm_riscv_vcpu_set_reg_csr(vcpu, reg); + case KVM_REG_RISCV_TIMER: + return kvm_riscv_vcpu_set_reg_timer(vcpu, reg); + case KVM_REG_RISCV_FP_F: + return kvm_riscv_vcpu_set_reg_fp(vcpu, reg, + KVM_REG_RISCV_FP_F); + case KVM_REG_RISCV_FP_D: + return kvm_riscv_vcpu_set_reg_fp(vcpu, reg, + KVM_REG_RISCV_FP_D); + case KVM_REG_RISCV_VECTOR: + return kvm_riscv_vcpu_set_reg_vector(vcpu, reg); + case KVM_REG_RISCV_ISA_EXT: + return kvm_riscv_vcpu_set_reg_isa_ext(vcpu, reg); + case KVM_REG_RISCV_SBI_EXT: + return kvm_riscv_vcpu_set_reg_sbi_ext(vcpu, reg); + case KVM_REG_RISCV_SBI_STATE: + return kvm_riscv_vcpu_set_reg_sbi(vcpu, reg); + default: + break; + } + + return -ENOENT; +} + +int kvm_riscv_vcpu_get_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + switch (reg->id & KVM_REG_RISCV_TYPE_MASK) { + case KVM_REG_RISCV_CONFIG: + return kvm_riscv_vcpu_get_reg_config(vcpu, reg); + case KVM_REG_RISCV_CORE: + return kvm_riscv_vcpu_get_reg_core(vcpu, reg); + case KVM_REG_RISCV_CSR: + return kvm_riscv_vcpu_get_reg_csr(vcpu, reg); + case KVM_REG_RISCV_TIMER: + return kvm_riscv_vcpu_get_reg_timer(vcpu, reg); + case KVM_REG_RISCV_FP_F: + return kvm_riscv_vcpu_get_reg_fp(vcpu, reg, + KVM_REG_RISCV_FP_F); + case KVM_REG_RISCV_FP_D: + return kvm_riscv_vcpu_get_reg_fp(vcpu, reg, + KVM_REG_RISCV_FP_D); + case KVM_REG_RISCV_VECTOR: + return kvm_riscv_vcpu_get_reg_vector(vcpu, reg); + case KVM_REG_RISCV_ISA_EXT: + return kvm_riscv_vcpu_get_reg_isa_ext(vcpu, reg); + case KVM_REG_RISCV_SBI_EXT: + return kvm_riscv_vcpu_get_reg_sbi_ext(vcpu, reg); + case KVM_REG_RISCV_SBI_STATE: + return kvm_riscv_vcpu_get_reg_sbi(vcpu, reg); + default: + break; + } + + return -ENOENT; +} diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c new file mode 100644 index 000000000000..a2fae70ee174 --- /dev/null +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -0,0 +1,906 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Rivos Inc + * + * Authors: + * Atish Patra <atishp@rivosinc.com> + */ + +#define pr_fmt(fmt) "riscv-kvm-pmu: " fmt +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/perf/riscv_pmu.h> +#include <asm/csr.h> +#include <asm/kvm_vcpu_sbi.h> +#include <asm/kvm_vcpu_pmu.h> +#include <asm/sbi.h> +#include <linux/bitops.h> + +#define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs) +#define get_event_type(x) (((x) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16) +#define get_event_code(x) ((x) & SBI_PMU_EVENT_IDX_CODE_MASK) + +static enum perf_hw_id hw_event_perf_map[SBI_PMU_HW_GENERAL_MAX] = { + [SBI_PMU_HW_CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES, + [SBI_PMU_HW_INSTRUCTIONS] = PERF_COUNT_HW_INSTRUCTIONS, + [SBI_PMU_HW_CACHE_REFERENCES] = PERF_COUNT_HW_CACHE_REFERENCES, + [SBI_PMU_HW_CACHE_MISSES] = PERF_COUNT_HW_CACHE_MISSES, + [SBI_PMU_HW_BRANCH_INSTRUCTIONS] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, + [SBI_PMU_HW_BRANCH_MISSES] = PERF_COUNT_HW_BRANCH_MISSES, + [SBI_PMU_HW_BUS_CYCLES] = PERF_COUNT_HW_BUS_CYCLES, + [SBI_PMU_HW_STALLED_CYCLES_FRONTEND] = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, + [SBI_PMU_HW_STALLED_CYCLES_BACKEND] = PERF_COUNT_HW_STALLED_CYCLES_BACKEND, + [SBI_PMU_HW_REF_CPU_CYCLES] = PERF_COUNT_HW_REF_CPU_CYCLES, +}; + +static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc) +{ + u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0); + u64 sample_period; + + if (!pmc->counter_val) + sample_period = counter_val_mask; + else + sample_period = (-pmc->counter_val) & counter_val_mask; + + return sample_period; +} + +static u32 kvm_pmu_get_perf_event_type(unsigned long eidx) +{ + enum sbi_pmu_event_type etype = get_event_type(eidx); + u32 type = PERF_TYPE_MAX; + + switch (etype) { + case SBI_PMU_EVENT_TYPE_HW: + type = PERF_TYPE_HARDWARE; + break; + case SBI_PMU_EVENT_TYPE_CACHE: + type = PERF_TYPE_HW_CACHE; + break; + case SBI_PMU_EVENT_TYPE_RAW: + case SBI_PMU_EVENT_TYPE_RAW_V2: + case SBI_PMU_EVENT_TYPE_FW: + type = PERF_TYPE_RAW; + break; + default: + break; + } + + return type; +} + +static bool kvm_pmu_is_fw_event(unsigned long eidx) +{ + return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW; +} + +static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + perf_event_disable(pmc->perf_event); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +static u64 kvm_pmu_get_perf_event_hw_config(u32 sbi_event_code) +{ + return hw_event_perf_map[sbi_event_code]; +} + +static u64 kvm_pmu_get_perf_event_cache_config(u32 sbi_event_code) +{ + u64 config = U64_MAX; + unsigned int cache_type, cache_op, cache_result; + + /* All the cache event masks lie within 0xFF. No separate masking is necessary */ + cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >> + SBI_PMU_EVENT_CACHE_ID_SHIFT; + cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >> + SBI_PMU_EVENT_CACHE_OP_SHIFT; + cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK; + + if (cache_type >= PERF_COUNT_HW_CACHE_MAX || + cache_op >= PERF_COUNT_HW_CACHE_OP_MAX || + cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return config; + + config = cache_type | (cache_op << 8) | (cache_result << 16); + + return config; +} + +static u64 kvm_pmu_get_perf_event_config(unsigned long eidx, uint64_t evt_data) +{ + enum sbi_pmu_event_type etype = get_event_type(eidx); + u32 ecode = get_event_code(eidx); + u64 config = U64_MAX; + + switch (etype) { + case SBI_PMU_EVENT_TYPE_HW: + if (ecode < SBI_PMU_HW_GENERAL_MAX) + config = kvm_pmu_get_perf_event_hw_config(ecode); + break; + case SBI_PMU_EVENT_TYPE_CACHE: + config = kvm_pmu_get_perf_event_cache_config(ecode); + break; + case SBI_PMU_EVENT_TYPE_RAW: + config = evt_data & RISCV_PMU_RAW_EVENT_MASK; + break; + case SBI_PMU_EVENT_TYPE_RAW_V2: + config = evt_data & RISCV_PMU_RAW_EVENT_V2_MASK; + break; + case SBI_PMU_EVENT_TYPE_FW: + if (ecode < SBI_PMU_FW_MAX) + config = (1ULL << 63) | ecode; + break; + default: + break; + } + + return config; +} + +static int kvm_pmu_get_fixed_pmc_index(unsigned long eidx) +{ + u32 etype = kvm_pmu_get_perf_event_type(eidx); + u32 ecode = get_event_code(eidx); + + if (etype != SBI_PMU_EVENT_TYPE_HW) + return -EINVAL; + + if (ecode == SBI_PMU_HW_CPU_CYCLES) + return 0; + else if (ecode == SBI_PMU_HW_INSTRUCTIONS) + return 2; + else + return -EINVAL; +} + +static int kvm_pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx, + unsigned long cbase, unsigned long cmask) +{ + int ctr_idx = -1; + int i, pmc_idx; + int min, max; + + if (kvm_pmu_is_fw_event(eidx)) { + /* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */ + min = kvpmu->num_hw_ctrs; + max = min + kvpmu->num_fw_ctrs; + } else { + /* First 3 counters are reserved for fixed counters */ + min = 3; + max = kvpmu->num_hw_ctrs; + } + + for_each_set_bit(i, &cmask, BITS_PER_LONG) { + pmc_idx = i + cbase; + if ((pmc_idx >= min && pmc_idx < max) && + !test_bit(pmc_idx, kvpmu->pmc_in_use)) { + ctr_idx = pmc_idx; + break; + } + } + + return ctr_idx; +} + +static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx, + unsigned long cbase, unsigned long cmask) +{ + int ret; + + /* Fixed counters need to be have fixed mapping as they have different width */ + ret = kvm_pmu_get_fixed_pmc_index(eidx); + if (ret >= 0) + return ret; + + return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask); +} + +static int pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx, + unsigned long *out_val) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int fevent_code; + + if (!IS_ENABLED(CONFIG_32BIT)) { + pr_warn("%s: should be invoked for only RV32\n", __func__); + return -EINVAL; + } + + if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) { + pr_warn("Invalid counter id [%ld]during read\n", cidx); + return -EINVAL; + } + + pmc = &kvpmu->pmc[cidx]; + + if (pmc->cinfo.type != SBI_PMU_CTR_TYPE_FW) + return -EINVAL; + + fevent_code = get_event_code(pmc->event_idx); + pmc->counter_val = kvpmu->fw_event[fevent_code].value; + + *out_val = pmc->counter_val >> 32; + + return 0; +} + +static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, + unsigned long *out_val) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u64 enabled, running; + int fevent_code; + + if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) { + pr_warn("Invalid counter id [%ld] during read\n", cidx); + return -EINVAL; + } + + pmc = &kvpmu->pmc[cidx]; + + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + fevent_code = get_event_code(pmc->event_idx); + pmc->counter_val = kvpmu->fw_event[fevent_code].value; + } else if (pmc->perf_event) { + pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running); + } else { + return -EINVAL; + } + *out_val = pmc->counter_val; + + return 0; +} + +static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base, + unsigned long ctr_mask) +{ + /* Make sure the we have a valid counter mask requested from the caller */ + if (!ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu))) + return -EINVAL; + + return 0; +} + +static void kvm_riscv_pmu_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct kvm_vcpu *vcpu = pmc->vcpu; + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct riscv_pmu *rpmu = to_riscv_pmu(perf_event->pmu); + u64 period; + + /* + * Stop the event counting by directly accessing the perf_event. + * Otherwise, this needs to deferred via a workqueue. + * That will introduce skew in the counter value because the actual + * physical counter would start after returning from this function. + * It will be stopped again once the workqueue is scheduled + */ + rpmu->pmu.stop(perf_event, PERF_EF_UPDATE); + + /* + * The hw counter would start automatically when this function returns. + * Thus, the host may continue to interrupt and inject it to the guest + * even without the guest configuring the next event. Depending on the hardware + * the host may have some sluggishness only if privilege mode filtering is not + * available. In an ideal world, where qemu is not the only capable hardware, + * this can be removed. + * FYI: ARM64 does this way while x86 doesn't do anything as such. + * TODO: Should we keep it for RISC-V ? + */ + period = -(local64_read(&perf_event->count)); + + local64_set(&perf_event->hw.period_left, 0); + perf_event->attr.sample_period = period; + perf_event->hw.sample_period = period; + + set_bit(pmc->idx, kvpmu->pmc_overflown); + kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_PMU_OVF); + + rpmu->pmu.start(perf_event, PERF_EF_RELOAD); +} + +static long kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr, + unsigned long flags, unsigned long eidx, + unsigned long evtdata) +{ + struct perf_event *event; + + kvm_pmu_release_perf_event(pmc); + attr->config = kvm_pmu_get_perf_event_config(eidx, evtdata); + if (flags & SBI_PMU_CFG_FLAG_CLEAR_VALUE) { + //TODO: Do we really want to clear the value in hardware counter + pmc->counter_val = 0; + } + + /* + * Set the default sample_period for now. The guest specified value + * will be updated in the start call. + */ + attr->sample_period = kvm_pmu_get_sample_period(pmc); + + event = perf_event_create_kernel_counter(attr, -1, current, kvm_riscv_pmu_overflow, pmc); + if (IS_ERR(event)) { + pr_debug("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event)); + return PTR_ERR(event); + } + + pmc->perf_event = event; + if (flags & SBI_PMU_CFG_FLAG_AUTO_START) + perf_event_enable(pmc->perf_event); + + return 0; +} + +int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_fw_event *fevent; + + if (!kvpmu || fid >= SBI_PMU_FW_MAX) + return -EINVAL; + + fevent = &kvpmu->fw_event[fid]; + if (fevent->started) + fevent->value++; + + return 0; +} + +int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int cidx, ret = KVM_INSN_CONTINUE_NEXT_SEPC; + + if (!kvpmu || !kvpmu->init_done) { + /* + * In absence of sscofpmf in the platform, the guest OS may use + * the legacy PMU driver to read cycle/instret. In that case, + * just return 0 to avoid any illegal trap. However, any other + * hpmcounter access should result in illegal trap as they must + * be access through SBI PMU only. + */ + if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) { + *val = 0; + return ret; + } else { + return KVM_INSN_ILLEGAL_TRAP; + } + } + + /* The counter CSR are read only. Thus, any write should result in illegal traps */ + if (wr_mask) + return KVM_INSN_ILLEGAL_TRAP; + + cidx = csr_num - CSR_CYCLE; + + if (pmu_ctr_read(vcpu, cidx, val) < 0) + return KVM_INSN_ILLEGAL_TRAP; + + return ret; +} + +static void kvm_pmu_clear_snapshot_area(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + + kfree(kvpmu->sdata); + kvpmu->sdata = NULL; + kvpmu->snapshot_addr = INVALID_GPA; +} + +int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long saddr_low, + unsigned long saddr_high, unsigned long flags, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data); + int sbiret = 0; + gpa_t saddr; + + if (!kvpmu || flags) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + if (saddr_low == SBI_SHMEM_DISABLE && saddr_high == SBI_SHMEM_DISABLE) { + kvm_pmu_clear_snapshot_area(vcpu); + return 0; + } + + saddr = saddr_low; + + if (saddr_high != 0) { + if (IS_ENABLED(CONFIG_32BIT)) + saddr |= ((gpa_t)saddr_high << 32); + else + sbiret = SBI_ERR_INVALID_ADDRESS; + goto out; + } + + kvpmu->sdata = kzalloc(snapshot_area_size, GFP_ATOMIC); + if (!kvpmu->sdata) + return -ENOMEM; + + /* No need to check writable slot explicitly as kvm_vcpu_write_guest does it internally */ + if (kvm_vcpu_write_guest(vcpu, saddr, kvpmu->sdata, snapshot_area_size)) { + kfree(kvpmu->sdata); + sbiret = SBI_ERR_INVALID_ADDRESS; + goto out; + } + + kvpmu->snapshot_addr = saddr; + +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_event_info(struct kvm_vcpu *vcpu, unsigned long saddr_low, + unsigned long saddr_high, unsigned long num_events, + unsigned long flags, struct kvm_vcpu_sbi_return *retdata) +{ + struct riscv_pmu_event_info *einfo = NULL; + int shmem_size = num_events * sizeof(*einfo); + gpa_t shmem; + u32 eidx, etype; + u64 econfig; + int ret; + + if (flags != 0 || (saddr_low & (SZ_16 - 1) || num_events == 0)) { + ret = SBI_ERR_INVALID_PARAM; + goto out; + } + + shmem = saddr_low; + if (saddr_high != 0) { + if (IS_ENABLED(CONFIG_32BIT)) { + shmem |= ((gpa_t)saddr_high << 32); + } else { + ret = SBI_ERR_INVALID_ADDRESS; + goto out; + } + } + + einfo = kzalloc(shmem_size, GFP_KERNEL); + if (!einfo) + return -ENOMEM; + + ret = kvm_vcpu_read_guest(vcpu, shmem, einfo, shmem_size); + if (ret) { + ret = SBI_ERR_FAILURE; + goto free_mem; + } + + for (int i = 0; i < num_events; i++) { + eidx = einfo[i].event_idx; + etype = kvm_pmu_get_perf_event_type(eidx); + econfig = kvm_pmu_get_perf_event_config(eidx, einfo[i].event_data); + ret = riscv_pmu_get_event_info(etype, econfig, NULL); + einfo[i].output = (ret > 0) ? 1 : 0; + } + + ret = kvm_vcpu_write_guest(vcpu, shmem, einfo, shmem_size); + if (ret) { + ret = SBI_ERR_INVALID_ADDRESS; + goto free_mem; + } + + ret = 0; +free_mem: + kfree(einfo); +out: + retdata->err_val = ret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + + retdata->out_val = kvm_pmu_num_counters(kvpmu); + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + + if (cidx > RISCV_KVM_MAX_COUNTERS || cidx == 1) { + retdata->err_val = SBI_ERR_INVALID_PARAM; + return 0; + } + + retdata->out_val = kvpmu->pmc[cidx].cinfo.value; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, u64 ival, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int i, pmc_index, sbiret = 0; + struct kvm_pmc *pmc; + int fevent_code; + bool snap_flag_set = flags & SBI_PMU_START_FLAG_INIT_SNAPSHOT; + + if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + if (snap_flag_set) { + if (kvpmu->snapshot_addr == INVALID_GPA) { + sbiret = SBI_ERR_NO_SHMEM; + goto out; + } + if (kvm_vcpu_read_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata, + sizeof(struct riscv_pmu_snapshot_data))) { + pr_warn("Unable to read snapshot shared memory while starting counters\n"); + sbiret = SBI_ERR_FAILURE; + goto out; + } + } + /* Start the counters that have been configured and requested by the guest */ + for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) { + pmc_index = i + ctr_base; + if (!test_bit(pmc_index, kvpmu->pmc_in_use)) + continue; + /* The guest started the counter again. Reset the overflow status */ + clear_bit(pmc_index, kvpmu->pmc_overflown); + pmc = &kvpmu->pmc[pmc_index]; + if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE) { + pmc->counter_val = ival; + } else if (snap_flag_set) { + /* The counter index in the snapshot are relative to the counter base */ + pmc->counter_val = kvpmu->sdata->ctr_values[i]; + } + + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + fevent_code = get_event_code(pmc->event_idx); + if (fevent_code >= SBI_PMU_FW_MAX) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + /* Check if the counter was already started for some reason */ + if (kvpmu->fw_event[fevent_code].started) { + sbiret = SBI_ERR_ALREADY_STARTED; + continue; + } + + kvpmu->fw_event[fevent_code].started = true; + kvpmu->fw_event[fevent_code].value = pmc->counter_val; + } else if (pmc->perf_event) { + if (unlikely(pmc->started)) { + sbiret = SBI_ERR_ALREADY_STARTED; + continue; + } + perf_event_period(pmc->perf_event, kvm_pmu_get_sample_period(pmc)); + perf_event_enable(pmc->perf_event); + pmc->started = true; + } else { + sbiret = SBI_ERR_INVALID_PARAM; + } + } + +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int i, pmc_index, sbiret = 0; + u64 enabled, running; + struct kvm_pmc *pmc; + int fevent_code; + bool snap_flag_set = flags & SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT; + bool shmem_needs_update = false; + + if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + if (snap_flag_set && kvpmu->snapshot_addr == INVALID_GPA) { + sbiret = SBI_ERR_NO_SHMEM; + goto out; + } + + /* Stop the counters that have been configured and requested by the guest */ + for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) { + pmc_index = i + ctr_base; + if (!test_bit(pmc_index, kvpmu->pmc_in_use)) + continue; + pmc = &kvpmu->pmc[pmc_index]; + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + fevent_code = get_event_code(pmc->event_idx); + if (fevent_code >= SBI_PMU_FW_MAX) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + if (!kvpmu->fw_event[fevent_code].started) + sbiret = SBI_ERR_ALREADY_STOPPED; + + kvpmu->fw_event[fevent_code].started = false; + } else if (pmc->perf_event) { + if (pmc->started) { + /* Stop counting the counter */ + perf_event_disable(pmc->perf_event); + pmc->started = false; + } else { + sbiret = SBI_ERR_ALREADY_STOPPED; + } + + if (flags & SBI_PMU_STOP_FLAG_RESET) + /* Release the counter if this is a reset request */ + kvm_pmu_release_perf_event(pmc); + } else { + sbiret = SBI_ERR_INVALID_PARAM; + } + + if (snap_flag_set && !sbiret) { + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) + pmc->counter_val = kvpmu->fw_event[fevent_code].value; + else if (pmc->perf_event) + pmc->counter_val += perf_event_read_value(pmc->perf_event, + &enabled, &running); + /* + * The counter and overflow indicies in the snapshot region are w.r.to + * cbase. Modify the set bit in the counter mask instead of the pmc_index + * which indicates the absolute counter index. + */ + if (test_bit(pmc_index, kvpmu->pmc_overflown)) + kvpmu->sdata->ctr_overflow_mask |= BIT(i); + kvpmu->sdata->ctr_values[i] = pmc->counter_val; + shmem_needs_update = true; + } + + if (flags & SBI_PMU_STOP_FLAG_RESET) { + pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + clear_bit(pmc_index, kvpmu->pmc_in_use); + clear_bit(pmc_index, kvpmu->pmc_overflown); + if (snap_flag_set) { + /* + * Only clear the given counter as the caller is responsible to + * validate both the overflow mask and configured counters. + */ + kvpmu->sdata->ctr_overflow_mask &= ~BIT(i); + shmem_needs_update = true; + } + } + } + + if (shmem_needs_update) + kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata, + sizeof(struct riscv_pmu_snapshot_data)); + +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, + unsigned long eidx, u64 evtdata, + struct kvm_vcpu_sbi_return *retdata) +{ + int ctr_idx, sbiret = 0; + long ret; + bool is_fevent; + unsigned long event_code; + u32 etype = kvm_pmu_get_perf_event_type(eidx); + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = NULL; + struct perf_event_attr attr = { + .type = etype, + .size = sizeof(struct perf_event_attr), + .pinned = true, + .disabled = true, + /* + * It should never reach here if the platform doesn't support the sscofpmf + * extension as mode filtering won't work without it. + */ + .exclude_host = true, + .exclude_hv = true, + .exclude_user = !!(flags & SBI_PMU_CFG_FLAG_SET_UINH), + .exclude_kernel = !!(flags & SBI_PMU_CFG_FLAG_SET_SINH), + .config1 = RISCV_PMU_CONFIG1_GUEST_EVENTS, + }; + + if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + event_code = get_event_code(eidx); + is_fevent = kvm_pmu_is_fw_event(eidx); + if (is_fevent && event_code >= SBI_PMU_FW_MAX) { + sbiret = SBI_ERR_NOT_SUPPORTED; + goto out; + } + + /* + * SKIP_MATCH flag indicates the caller is aware of the assigned counter + * for this event. Just do a sanity check if it already marked used. + */ + if (flags & SBI_PMU_CFG_FLAG_SKIP_MATCH) { + if (!test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use)) { + sbiret = SBI_ERR_FAILURE; + goto out; + } + ctr_idx = ctr_base + __ffs(ctr_mask); + } else { + ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask); + if (ctr_idx < 0) { + sbiret = SBI_ERR_NOT_SUPPORTED; + goto out; + } + } + + pmc = &kvpmu->pmc[ctr_idx]; + pmc->idx = ctr_idx; + + if (is_fevent) { + if (flags & SBI_PMU_CFG_FLAG_AUTO_START) + kvpmu->fw_event[event_code].started = true; + } else { + ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata); + if (ret) { + sbiret = SBI_ERR_NOT_SUPPORTED; + goto out; + } + } + + set_bit(ctr_idx, kvpmu->pmc_in_use); + pmc->event_idx = eidx; + retdata->out_val = ctr_idx; +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata) +{ + int ret; + + ret = pmu_fw_ctr_read_hi(vcpu, cidx, &retdata->out_val); + if (ret == -EINVAL) + retdata->err_val = SBI_ERR_INVALID_PARAM; + + return 0; +} + +int kvm_riscv_vcpu_pmu_fw_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata) +{ + int ret; + + ret = pmu_ctr_read(vcpu, cidx, &retdata->out_val); + if (ret == -EINVAL) + retdata->err_val = SBI_ERR_INVALID_PARAM; + + return 0; +} + +void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) +{ + int i = 0, ret, num_hw_ctrs = 0, hpm_width = 0; + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + /* + * PMU functionality should be only available to guests if privilege mode + * filtering is available in the host. Otherwise, guest will always count + * events while the execution is in hypervisor mode. + */ + if (!riscv_isa_extension_available(NULL, SSCOFPMF)) + return; + + ret = riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs); + if (ret < 0 || !hpm_width || !num_hw_ctrs) + return; + + /* + * Increase the number of hardware counters to offset the time counter. + */ + kvpmu->num_hw_ctrs = num_hw_ctrs + 1; + kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX; + memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event)); + kvpmu->snapshot_addr = INVALID_GPA; + + if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) { + pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA"); + kvpmu->num_hw_ctrs = RISCV_KVM_MAX_HW_CTRS; + } + + /* + * There is no correlation between the logical hardware counter and virtual counters. + * However, we need to encode a hpmcounter CSR in the counter info field so that + * KVM can trap n emulate the read. This works well in the migration use case as + * KVM doesn't care if the actual hpmcounter is available in the hardware or not. + */ + for (i = 0; i < kvm_pmu_num_counters(kvpmu); i++) { + /* TIME CSR shouldn't be read from perf interface */ + if (i == 1) + continue; + pmc = &kvpmu->pmc[i]; + pmc->idx = i; + pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + pmc->vcpu = vcpu; + if (i < kvpmu->num_hw_ctrs) { + pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW; + if (i < 3) + /* CY, IR counters */ + pmc->cinfo.width = 63; + else + pmc->cinfo.width = hpm_width; + /* + * The CSR number doesn't have any relation with the logical + * hardware counters. The CSR numbers are encoded sequentially + * to avoid maintaining a map between the virtual counter + * and CSR number. + */ + pmc->cinfo.csr = CSR_CYCLE + i; + } else { + pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW; + pmc->cinfo.width = 63; + } + } + + kvpmu->init_done = true; +} + +void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + if (!kvpmu) + return; + + for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS) { + pmc = &kvpmu->pmc[i]; + pmc->counter_val = 0; + kvm_pmu_release_perf_event(pmc); + pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + } + bitmap_zero(kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS); + bitmap_zero(kvpmu->pmc_overflown, RISCV_KVM_MAX_COUNTERS); + memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event)); + kvm_pmu_clear_snapshot_area(vcpu); +} + +void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu) +{ + kvm_riscv_vcpu_pmu_deinit(vcpu); +} diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index f96991d230bf..46ab7b989432 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -12,26 +12,6 @@ #include <asm/sbi.h> #include <asm/kvm_vcpu_sbi.h> -static int kvm_linux_err_map_sbi(int err) -{ - switch (err) { - case 0: - return SBI_SUCCESS; - case -EPERM: - return SBI_ERR_DENIED; - case -EINVAL: - return SBI_ERR_INVALID_PARAM; - case -EFAULT: - return SBI_ERR_INVALID_ADDRESS; - case -EOPNOTSUPP: - return SBI_ERR_NOT_SUPPORTED; - case -EALREADY: - return SBI_ERR_ALREADY_AVAILABLE; - default: - return SBI_ERR_FAILURE; - }; -} - #ifndef CONFIG_RISCV_SBI_V01 static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = { .extid_start = -1UL, @@ -40,19 +20,113 @@ static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = { }; #endif -static const struct kvm_vcpu_sbi_extension *sbi_ext[] = { - &vcpu_sbi_ext_v01, - &vcpu_sbi_ext_base, - &vcpu_sbi_ext_time, - &vcpu_sbi_ext_ipi, - &vcpu_sbi_ext_rfence, - &vcpu_sbi_ext_srst, - &vcpu_sbi_ext_hsm, - &vcpu_sbi_ext_experimental, - &vcpu_sbi_ext_vendor, +#ifndef CONFIG_RISCV_PMU_SBI +static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = { + .extid_start = -1UL, + .extid_end = -1UL, + .handler = NULL, +}; +#endif + +struct kvm_riscv_sbi_extension_entry { + enum KVM_RISCV_SBI_EXT_ID ext_idx; + const struct kvm_vcpu_sbi_extension *ext_ptr; +}; + +static const struct kvm_riscv_sbi_extension_entry sbi_ext[] = { + { + .ext_idx = KVM_RISCV_SBI_EXT_V01, + .ext_ptr = &vcpu_sbi_ext_v01, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_MAX, /* Can't be disabled */ + .ext_ptr = &vcpu_sbi_ext_base, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_TIME, + .ext_ptr = &vcpu_sbi_ext_time, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_IPI, + .ext_ptr = &vcpu_sbi_ext_ipi, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_RFENCE, + .ext_ptr = &vcpu_sbi_ext_rfence, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_SRST, + .ext_ptr = &vcpu_sbi_ext_srst, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_HSM, + .ext_ptr = &vcpu_sbi_ext_hsm, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_PMU, + .ext_ptr = &vcpu_sbi_ext_pmu, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_DBCN, + .ext_ptr = &vcpu_sbi_ext_dbcn, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_SUSP, + .ext_ptr = &vcpu_sbi_ext_susp, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_STA, + .ext_ptr = &vcpu_sbi_ext_sta, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_FWFT, + .ext_ptr = &vcpu_sbi_ext_fwft, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_MPXY, + .ext_ptr = &vcpu_sbi_ext_mpxy, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL, + .ext_ptr = &vcpu_sbi_ext_experimental, + }, + { + .ext_idx = KVM_RISCV_SBI_EXT_VENDOR, + .ext_ptr = &vcpu_sbi_ext_vendor, + }, }; -void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run) +static const struct kvm_riscv_sbi_extension_entry * +riscv_vcpu_get_sbi_ext(struct kvm_vcpu *vcpu, unsigned long idx) +{ + const struct kvm_riscv_sbi_extension_entry *sext = NULL; + + if (idx >= KVM_RISCV_SBI_EXT_MAX) + return NULL; + + for (int i = 0; i < ARRAY_SIZE(sbi_ext); i++) { + if (sbi_ext[i].ext_idx == idx) { + sext = &sbi_ext[i]; + break; + } + } + + return sext; +} + +static bool riscv_vcpu_supports_sbi_ext(struct kvm_vcpu *vcpu, int idx) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *sext; + + sext = riscv_vcpu_get_sbi_ext(vcpu, idx); + + return sext && scontext->ext_status[sext->ext_idx] != KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE; +} + +int kvm_riscv_vcpu_sbi_forward_handler(struct kvm_vcpu *vcpu, + struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) { struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -67,8 +141,10 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run) run->riscv_sbi.args[3] = cp->a3; run->riscv_sbi.args[4] = cp->a4; run->riscv_sbi.args[5] = cp->a5; - run->riscv_sbi.ret[0] = cp->a0; - run->riscv_sbi.ret[1] = cp->a1; + run->riscv_sbi.ret[0] = SBI_ERR_NOT_SUPPORTED; + run->riscv_sbi.ret[1] = 0; + retdata->uexit = true; + return 0; } void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, @@ -78,8 +154,11 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, unsigned long i; struct kvm_vcpu *tmp; - kvm_for_each_vcpu(i, tmp, vcpu->kvm) - tmp->arch.power_off = true; + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { + spin_lock(&tmp->arch.mp_state_lock); + WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); + spin_unlock(&tmp->arch.mp_state_lock); + } kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); memset(&run->system_event, 0, sizeof(run->system_event)); @@ -89,6 +168,34 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, run->exit_reason = KVM_EXIT_SYSTEM_EVENT; } +void kvm_riscv_vcpu_sbi_request_reset(struct kvm_vcpu *vcpu, + unsigned long pc, unsigned long a1) +{ + spin_lock(&vcpu->arch.reset_state.lock); + vcpu->arch.reset_state.pc = pc; + vcpu->arch.reset_state.a1 = a1; + spin_unlock(&vcpu->arch.reset_state.lock); + + kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); +} + +void kvm_riscv_vcpu_sbi_load_reset_state(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + struct kvm_vcpu_reset_state *reset_state = &vcpu->arch.reset_state; + + cntx->a0 = vcpu->vcpu_id; + + spin_lock(&vcpu->arch.reset_state.lock); + cntx->sepc = reset_state->pc; + cntx->a1 = reset_state->a1; + spin_unlock(&vcpu->arch.reset_state.lock); + + cntx->sstatus &= ~SR_SIE; + csr->vsatp = 0; +} + int kvm_riscv_vcpu_sbi_return(struct kvm_vcpu *vcpu, struct kvm_run *run) { struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -108,14 +215,370 @@ int kvm_riscv_vcpu_sbi_return(struct kvm_vcpu *vcpu, struct kvm_run *run) return 0; } -const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext(unsigned long extid) +static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long reg_val) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *sext; + + if (reg_val != 1 && reg_val != 0) + return -EINVAL; + + sext = riscv_vcpu_get_sbi_ext(vcpu, reg_num); + if (!sext || scontext->ext_status[sext->ext_idx] == KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE) + return -ENOENT; + + scontext->ext_status[sext->ext_idx] = (reg_val) ? + KVM_RISCV_SBI_EXT_STATUS_ENABLED : + KVM_RISCV_SBI_EXT_STATUS_DISABLED; + + return 0; +} + +static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long *reg_val) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *sext; + + sext = riscv_vcpu_get_sbi_ext(vcpu, reg_num); + if (!sext || scontext->ext_status[sext->ext_idx] == KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE) + return -ENOENT; + + *reg_val = scontext->ext_status[sext->ext_idx] == + KVM_RISCV_SBI_EXT_STATUS_ENABLED; + + return 0; +} + +static int riscv_vcpu_set_sbi_ext_multi(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long reg_val, bool enable) +{ + unsigned long i, ext_id; + + if (reg_num > KVM_REG_RISCV_SBI_MULTI_REG_LAST) + return -ENOENT; + + for_each_set_bit(i, ®_val, BITS_PER_LONG) { + ext_id = i + reg_num * BITS_PER_LONG; + if (ext_id >= KVM_RISCV_SBI_EXT_MAX) + break; + + riscv_vcpu_set_sbi_ext_single(vcpu, ext_id, enable); + } + + return 0; +} + +static int riscv_vcpu_get_sbi_ext_multi(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long *reg_val) +{ + unsigned long i, ext_id, ext_val; + + if (reg_num > KVM_REG_RISCV_SBI_MULTI_REG_LAST) + return -ENOENT; + + for (i = 0; i < BITS_PER_LONG; i++) { + ext_id = i + reg_num * BITS_PER_LONG; + if (ext_id >= KVM_RISCV_SBI_EXT_MAX) + break; + + ext_val = 0; + riscv_vcpu_get_sbi_ext_single(vcpu, ext_id, &ext_val); + if (ext_val) + *reg_val |= KVM_REG_RISCV_SBI_MULTI_MASK(ext_id); + } + + return 0; +} + +int kvm_riscv_vcpu_reg_indices_sbi_ext(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + unsigned int n = 0; + + for (int i = 0; i < KVM_RISCV_SBI_EXT_MAX; i++) { + u64 size = IS_ENABLED(CONFIG_32BIT) ? + KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_EXT | + KVM_REG_RISCV_SBI_SINGLE | i; + + if (!riscv_vcpu_supports_sbi_ext(vcpu, i)) + continue; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + + n++; + } + + return n; +} + +int kvm_riscv_vcpu_set_reg_sbi_ext(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_SBI_EXT); + unsigned long reg_val, reg_subtype; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + if (vcpu->arch.ran_atleast_once) + return -EBUSY; + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch (reg_subtype) { + case KVM_REG_RISCV_SBI_SINGLE: + return riscv_vcpu_set_sbi_ext_single(vcpu, reg_num, reg_val); + case KVM_REG_RISCV_SBI_MULTI_EN: + return riscv_vcpu_set_sbi_ext_multi(vcpu, reg_num, reg_val, true); + case KVM_REG_RISCV_SBI_MULTI_DIS: + return riscv_vcpu_set_sbi_ext_multi(vcpu, reg_num, reg_val, false); + default: + return -ENOENT; + } + + return 0; +} + +int kvm_riscv_vcpu_get_reg_sbi_ext(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) { - int i = 0; + int rc; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_SBI_EXT); + unsigned long reg_val, reg_subtype; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + reg_val = 0; + switch (reg_subtype) { + case KVM_REG_RISCV_SBI_SINGLE: + rc = riscv_vcpu_get_sbi_ext_single(vcpu, reg_num, ®_val); + break; + case KVM_REG_RISCV_SBI_MULTI_EN: + case KVM_REG_RISCV_SBI_MULTI_DIS: + rc = riscv_vcpu_get_sbi_ext_multi(vcpu, reg_num, ®_val); + if (!rc && reg_subtype == KVM_REG_RISCV_SBI_MULTI_DIS) + reg_val = ~reg_val; + break; + default: + rc = -ENOENT; + } + if (rc) + return rc; + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +int kvm_riscv_vcpu_reg_indices_sbi(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *entry; + const struct kvm_vcpu_sbi_extension *ext; + unsigned long state_reg_count; + int i, j, rc, count = 0; + u64 reg; for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { - if (sbi_ext[i]->extid_start <= extid && - sbi_ext[i]->extid_end >= extid) - return sbi_ext[i]; + entry = &sbi_ext[i]; + ext = entry->ext_ptr; + + if (!ext->get_state_reg_count || + scontext->ext_status[entry->ext_idx] != KVM_RISCV_SBI_EXT_STATUS_ENABLED) + continue; + + state_reg_count = ext->get_state_reg_count(vcpu); + if (!uindices) + goto skip_put_user; + + for (j = 0; j < state_reg_count; j++) { + if (ext->get_state_reg_id) { + rc = ext->get_state_reg_id(vcpu, j, ®); + if (rc) + return rc; + } else { + reg = KVM_REG_RISCV | + (IS_ENABLED(CONFIG_32BIT) ? + KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64) | + KVM_REG_RISCV_SBI_STATE | + ext->state_reg_subtype | j; + } + + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + +skip_put_user: + count += state_reg_count; + } + + return count; +} + +static const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext_withstate(struct kvm_vcpu *vcpu, + unsigned long subtype) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *entry; + const struct kvm_vcpu_sbi_extension *ext; + int i; + + for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { + entry = &sbi_ext[i]; + ext = entry->ext_ptr; + + if (ext->get_state_reg_count && + ext->state_reg_subtype == subtype && + scontext->ext_status[entry->ext_idx] == KVM_RISCV_SBI_EXT_STATUS_ENABLED) + return ext; + } + + return NULL; +} + +int kvm_riscv_vcpu_set_reg_sbi(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_SBI_STATE); + const struct kvm_vcpu_sbi_extension *ext; + unsigned long reg_subtype; + void *reg_val; + u64 data64; + u32 data32; + u16 data16; + u8 data8; + + switch (KVM_REG_SIZE(reg->id)) { + case 1: + reg_val = &data8; + break; + case 2: + reg_val = &data16; + break; + case 4: + reg_val = &data32; + break; + case 8: + reg_val = &data64; + break; + default: + return -EINVAL; + } + + if (copy_from_user(reg_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + ext = kvm_vcpu_sbi_find_ext_withstate(vcpu, reg_subtype); + if (!ext || !ext->set_state_reg) + return -EINVAL; + + return ext->set_state_reg(vcpu, reg_num, KVM_REG_SIZE(reg->id), reg_val); +} + +int kvm_riscv_vcpu_get_reg_sbi(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_SBI_STATE); + const struct kvm_vcpu_sbi_extension *ext; + unsigned long reg_subtype; + void *reg_val; + u64 data64; + u32 data32; + u16 data16; + u8 data8; + int ret; + + switch (KVM_REG_SIZE(reg->id)) { + case 1: + reg_val = &data8; + break; + case 2: + reg_val = &data16; + break; + case 4: + reg_val = &data32; + break; + case 8: + reg_val = &data64; + break; + default: + return -EINVAL; + } + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + ext = kvm_vcpu_sbi_find_ext_withstate(vcpu, reg_subtype); + if (!ext || !ext->get_state_reg) + return -EINVAL; + + ret = ext->get_state_reg(vcpu, reg_num, KVM_REG_SIZE(reg->id), reg_val); + if (ret) + return ret; + + if (copy_to_user(uaddr, reg_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( + struct kvm_vcpu *vcpu, unsigned long extid) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *entry; + const struct kvm_vcpu_sbi_extension *ext; + int i; + + for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { + entry = &sbi_ext[i]; + ext = entry->ext_ptr; + + if (ext->extid_start <= extid && ext->extid_end >= extid) { + if (entry->ext_idx >= KVM_RISCV_SBI_EXT_MAX || + scontext->ext_status[entry->ext_idx] == + KVM_RISCV_SBI_EXT_STATUS_ENABLED) + return ext; + + return NULL; + } } return NULL; @@ -125,56 +588,138 @@ int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run) { int ret = 1; bool next_sepc = true; - bool userspace_exit = false; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; const struct kvm_vcpu_sbi_extension *sbi_ext; - struct kvm_cpu_trap utrap = { 0 }; - unsigned long out_val = 0; + struct kvm_cpu_trap utrap = {0}; + struct kvm_vcpu_sbi_return sbi_ret = { + .out_val = 0, + .err_val = 0, + .utrap = &utrap, + }; bool ext_is_v01 = false; - sbi_ext = kvm_vcpu_sbi_find_ext(cp->a7); + sbi_ext = kvm_vcpu_sbi_find_ext(vcpu, cp->a7); if (sbi_ext && sbi_ext->handler) { #ifdef CONFIG_RISCV_SBI_V01 if (cp->a7 >= SBI_EXT_0_1_SET_TIMER && cp->a7 <= SBI_EXT_0_1_SHUTDOWN) ext_is_v01 = true; #endif - ret = sbi_ext->handler(vcpu, run, &out_val, &utrap, &userspace_exit); + ret = sbi_ext->handler(vcpu, run, &sbi_ret); } else { /* Return error for unsupported SBI calls */ cp->a0 = SBI_ERR_NOT_SUPPORTED; goto ecall_done; } + /* + * When the SBI extension returns a Linux error code, it exits the ioctl + * loop and forwards the error to userspace. + */ + if (ret < 0) { + next_sepc = false; + goto ecall_done; + } + /* Handle special error cases i.e trap, exit or userspace forward */ - if (utrap.scause) { + if (sbi_ret.utrap->scause) { /* No need to increment sepc or exit ioctl loop */ ret = 1; - utrap.sepc = cp->sepc; - kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + sbi_ret.utrap->sepc = cp->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, sbi_ret.utrap); next_sepc = false; goto ecall_done; } /* Exit ioctl loop or Propagate the error code the guest */ - if (userspace_exit) { + if (sbi_ret.uexit) { next_sepc = false; ret = 0; } else { - /** - * SBI extension handler always returns an Linux error code. Convert - * it to the SBI specific error code that can be propagated the SBI - * caller. - */ - ret = kvm_linux_err_map_sbi(ret); - cp->a0 = ret; + cp->a0 = sbi_ret.err_val; ret = 1; } ecall_done: if (next_sepc) cp->sepc += 4; - if (!ext_is_v01) - cp->a1 = out_val; + /* a1 should only be updated when we continue the ioctl loop */ + if (!ext_is_v01 && ret == 1) + cp->a1 = sbi_ret.out_val; return ret; } + +void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *entry; + const struct kvm_vcpu_sbi_extension *ext; + int idx, i; + + for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { + entry = &sbi_ext[i]; + ext = entry->ext_ptr; + idx = entry->ext_idx; + + if (idx < 0 || idx >= ARRAY_SIZE(scontext->ext_status)) + continue; + + if (ext->probe && !ext->probe(vcpu)) { + scontext->ext_status[idx] = KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE; + continue; + } + + scontext->ext_status[idx] = ext->default_disabled ? + KVM_RISCV_SBI_EXT_STATUS_DISABLED : + KVM_RISCV_SBI_EXT_STATUS_ENABLED; + + if (ext->init && ext->init(vcpu) != 0) + scontext->ext_status[idx] = KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE; + } +} + +void kvm_riscv_vcpu_sbi_deinit(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *entry; + const struct kvm_vcpu_sbi_extension *ext; + int idx, i; + + for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { + entry = &sbi_ext[i]; + ext = entry->ext_ptr; + idx = entry->ext_idx; + + if (idx < 0 || idx >= ARRAY_SIZE(scontext->ext_status)) + continue; + + if (scontext->ext_status[idx] == KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE || + !ext->deinit) + continue; + + ext->deinit(vcpu); + } +} + +void kvm_riscv_vcpu_sbi_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *entry; + const struct kvm_vcpu_sbi_extension *ext; + int idx, i; + + for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { + entry = &sbi_ext[i]; + ext = entry->ext_ptr; + idx = entry->ext_idx; + + if (idx < 0 || idx >= ARRAY_SIZE(scontext->ext_status)) + continue; + + if (scontext->ext_status[idx] != KVM_RISCV_SBI_EXT_STATUS_ENABLED || + !ext->reset) + continue; + + ext->reset(vcpu); + } +} diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c index 5d65c634d301..06fdd5f69364 100644 --- a/arch/riscv/kvm/vcpu_sbi_base.c +++ b/arch/riscv/kvm/vcpu_sbi_base.c @@ -14,11 +14,11 @@ #include <asm/kvm_vcpu_sbi.h> static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *trap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { - int ret = 0; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + const struct kvm_vcpu_sbi_extension *sbi_ext; + unsigned long *out_val = &retdata->out_val; switch (cp->a6) { case SBI_EXT_BASE_GET_SPEC_VERSION: @@ -41,10 +41,12 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, * For experimental/vendor extensions * forward it to the userspace */ - kvm_riscv_vcpu_sbi_forward(vcpu, run); - *exit = true; - } else - *out_val = kvm_vcpu_sbi_find_ext(cp->a0) ? 1 : 0; + return kvm_riscv_vcpu_sbi_forward_handler(vcpu, run, retdata); + } else { + sbi_ext = kvm_vcpu_sbi_find_ext(vcpu, cp->a0); + *out_val = sbi_ext && sbi_ext->probe ? + sbi_ext->probe(vcpu) : !!sbi_ext; + } break; case SBI_EXT_BASE_GET_MVENDORID: *out_val = vcpu->arch.mvendorid; @@ -56,11 +58,11 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, *out_val = vcpu->arch.mimpid; break; default: - ret = -EOPNOTSUPP; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; break; } - return ret; + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = { @@ -68,30 +70,3 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = { .extid_end = SBI_EXT_BASE, .handler = kvm_sbi_ext_base_handler, }; - -static int kvm_sbi_ext_forward_handler(struct kvm_vcpu *vcpu, - struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, - bool *exit) -{ - /* - * Both SBI experimental and vendor extensions are - * unconditionally forwarded to userspace. - */ - kvm_riscv_vcpu_sbi_forward(vcpu, run); - *exit = true; - return 0; -} - -const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental = { - .extid_start = SBI_EXT_EXPERIMENTAL_START, - .extid_end = SBI_EXT_EXPERIMENTAL_END, - .handler = kvm_sbi_ext_forward_handler, -}; - -const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor = { - .extid_start = SBI_EXT_VENDOR_START, - .extid_end = SBI_EXT_VENDOR_END, - .handler = kvm_sbi_ext_forward_handler, -}; diff --git a/arch/riscv/kvm/vcpu_sbi_forward.c b/arch/riscv/kvm/vcpu_sbi_forward.c new file mode 100644 index 000000000000..5a3c75eb23c5 --- /dev/null +++ b/arch/riscv/kvm/vcpu_sbi_forward.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Ventana Micro Systems Inc. + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_vcpu_sbi.h> +#include <asm/sbi.h> + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental = { + .extid_start = SBI_EXT_EXPERIMENTAL_START, + .extid_end = SBI_EXT_EXPERIMENTAL_END, + .handler = kvm_riscv_vcpu_sbi_forward_handler, +}; + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor = { + .extid_start = SBI_EXT_VENDOR_START, + .extid_end = SBI_EXT_VENDOR_END, + .handler = kvm_riscv_vcpu_sbi_forward_handler, +}; + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn = { + .extid_start = SBI_EXT_DBCN, + .extid_end = SBI_EXT_DBCN, + .default_disabled = true, + .handler = kvm_riscv_vcpu_sbi_forward_handler, +}; + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_mpxy = { + .extid_start = SBI_EXT_MPXY, + .extid_end = SBI_EXT_MPXY, + .default_disabled = true, + .handler = kvm_riscv_vcpu_sbi_forward_handler, +}; diff --git a/arch/riscv/kvm/vcpu_sbi_fwft.c b/arch/riscv/kvm/vcpu_sbi_fwft.c new file mode 100644 index 000000000000..62cc9c3d5759 --- /dev/null +++ b/arch/riscv/kvm/vcpu_sbi_fwft.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Rivos Inc. + * + * Authors: + * Clément Léger <cleger@rivosinc.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <asm/cpufeature.h> +#include <asm/sbi.h> +#include <asm/kvm_vcpu_sbi.h> +#include <asm/kvm_vcpu_sbi_fwft.h> + +#define MIS_DELEG (BIT_ULL(EXC_LOAD_MISALIGNED) | BIT_ULL(EXC_STORE_MISALIGNED)) + +struct kvm_sbi_fwft_feature { + /** + * @id: Feature ID + */ + enum sbi_fwft_feature_t id; + + /** + * @first_reg_num: ONE_REG index of the first ONE_REG register + */ + unsigned long first_reg_num; + + /** + * @supported: Check if the feature is supported on the vcpu + * + * This callback is optional, if not provided the feature is assumed to + * be supported + */ + bool (*supported)(struct kvm_vcpu *vcpu); + + /** + * @reset: Reset the feature value irrespective whether feature is supported or not + * + * This callback is mandatory + */ + void (*reset)(struct kvm_vcpu *vcpu); + + /** + * @set: Set the feature value + * + * Return SBI_SUCCESS on success or an SBI error (SBI_ERR_*) + * + * This callback is mandatory + */ + long (*set)(struct kvm_vcpu *vcpu, struct kvm_sbi_fwft_config *conf, + bool one_reg_access, unsigned long value); + + /** + * @get: Get the feature current value + * + * Return SBI_SUCCESS on success or an SBI error (SBI_ERR_*) + * + * This callback is mandatory + */ + long (*get)(struct kvm_vcpu *vcpu, struct kvm_sbi_fwft_config *conf, + bool one_reg_access, unsigned long *value); +}; + +static const enum sbi_fwft_feature_t kvm_fwft_defined_features[] = { + SBI_FWFT_MISALIGNED_EXC_DELEG, + SBI_FWFT_LANDING_PAD, + SBI_FWFT_SHADOW_STACK, + SBI_FWFT_DOUBLE_TRAP, + SBI_FWFT_PTE_AD_HW_UPDATING, + SBI_FWFT_POINTER_MASKING_PMLEN, +}; + +static bool kvm_fwft_is_defined_feature(enum sbi_fwft_feature_t feature) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(kvm_fwft_defined_features); i++) { + if (kvm_fwft_defined_features[i] == feature) + return true; + } + + return false; +} + +static bool kvm_sbi_fwft_misaligned_delegation_supported(struct kvm_vcpu *vcpu) +{ + return misaligned_traps_can_delegate(); +} + +static void kvm_sbi_fwft_reset_misaligned_delegation(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + cfg->hedeleg &= ~MIS_DELEG; +} + +static long kvm_sbi_fwft_set_misaligned_delegation(struct kvm_vcpu *vcpu, + struct kvm_sbi_fwft_config *conf, + bool one_reg_access, unsigned long value) +{ + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + if (value == 1) { + cfg->hedeleg |= MIS_DELEG; + if (!one_reg_access) + csr_set(CSR_HEDELEG, MIS_DELEG); + } else if (value == 0) { + cfg->hedeleg &= ~MIS_DELEG; + if (!one_reg_access) + csr_clear(CSR_HEDELEG, MIS_DELEG); + } else { + return SBI_ERR_INVALID_PARAM; + } + + return SBI_SUCCESS; +} + +static long kvm_sbi_fwft_get_misaligned_delegation(struct kvm_vcpu *vcpu, + struct kvm_sbi_fwft_config *conf, + bool one_reg_access, unsigned long *value) +{ + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + *value = (cfg->hedeleg & MIS_DELEG) == MIS_DELEG; + return SBI_SUCCESS; +} + +#ifndef CONFIG_32BIT + +static bool try_to_set_pmm(unsigned long value) +{ + csr_set(CSR_HENVCFG, value); + return (csr_read_clear(CSR_HENVCFG, ENVCFG_PMM) & ENVCFG_PMM) == value; +} + +static bool kvm_sbi_fwft_pointer_masking_pmlen_supported(struct kvm_vcpu *vcpu) +{ + struct kvm_sbi_fwft *fwft = vcpu_to_fwft(vcpu); + + if (!riscv_isa_extension_available(vcpu->arch.isa, SMNPM)) + return false; + + fwft->have_vs_pmlen_7 = try_to_set_pmm(ENVCFG_PMM_PMLEN_7); + fwft->have_vs_pmlen_16 = try_to_set_pmm(ENVCFG_PMM_PMLEN_16); + + return fwft->have_vs_pmlen_7 || fwft->have_vs_pmlen_16; +} + +static void kvm_sbi_fwft_reset_pointer_masking_pmlen(struct kvm_vcpu *vcpu) +{ + vcpu->arch.cfg.henvcfg &= ~ENVCFG_PMM; +} + +static long kvm_sbi_fwft_set_pointer_masking_pmlen(struct kvm_vcpu *vcpu, + struct kvm_sbi_fwft_config *conf, + bool one_reg_access, unsigned long value) +{ + struct kvm_sbi_fwft *fwft = vcpu_to_fwft(vcpu); + unsigned long pmm; + + switch (value) { + case 0: + pmm = ENVCFG_PMM_PMLEN_0; + break; + case 7: + if (!fwft->have_vs_pmlen_7) + return SBI_ERR_INVALID_PARAM; + pmm = ENVCFG_PMM_PMLEN_7; + break; + case 16: + if (!fwft->have_vs_pmlen_16) + return SBI_ERR_INVALID_PARAM; + pmm = ENVCFG_PMM_PMLEN_16; + break; + default: + return SBI_ERR_INVALID_PARAM; + } + + vcpu->arch.cfg.henvcfg &= ~ENVCFG_PMM; + vcpu->arch.cfg.henvcfg |= pmm; + + /* + * Instead of waiting for vcpu_load/put() to update HENVCFG CSR, + * update here so that VCPU see's pointer masking mode change + * immediately. + */ + if (!one_reg_access) + csr_write(CSR_HENVCFG, vcpu->arch.cfg.henvcfg); + + return SBI_SUCCESS; +} + +static long kvm_sbi_fwft_get_pointer_masking_pmlen(struct kvm_vcpu *vcpu, + struct kvm_sbi_fwft_config *conf, + bool one_reg_access, unsigned long *value) +{ + switch (vcpu->arch.cfg.henvcfg & ENVCFG_PMM) { + case ENVCFG_PMM_PMLEN_0: + *value = 0; + break; + case ENVCFG_PMM_PMLEN_7: + *value = 7; + break; + case ENVCFG_PMM_PMLEN_16: + *value = 16; + break; + default: + return SBI_ERR_FAILURE; + } + + return SBI_SUCCESS; +} + +#endif + +static const struct kvm_sbi_fwft_feature features[] = { + { + .id = SBI_FWFT_MISALIGNED_EXC_DELEG, + .first_reg_num = offsetof(struct kvm_riscv_sbi_fwft, misaligned_deleg.enable) / + sizeof(unsigned long), + .supported = kvm_sbi_fwft_misaligned_delegation_supported, + .reset = kvm_sbi_fwft_reset_misaligned_delegation, + .set = kvm_sbi_fwft_set_misaligned_delegation, + .get = kvm_sbi_fwft_get_misaligned_delegation, + }, +#ifndef CONFIG_32BIT + { + .id = SBI_FWFT_POINTER_MASKING_PMLEN, + .first_reg_num = offsetof(struct kvm_riscv_sbi_fwft, pointer_masking.enable) / + sizeof(unsigned long), + .supported = kvm_sbi_fwft_pointer_masking_pmlen_supported, + .reset = kvm_sbi_fwft_reset_pointer_masking_pmlen, + .set = kvm_sbi_fwft_set_pointer_masking_pmlen, + .get = kvm_sbi_fwft_get_pointer_masking_pmlen, + }, +#endif +}; + +static const struct kvm_sbi_fwft_feature *kvm_sbi_fwft_regnum_to_feature(unsigned long reg_num) +{ + const struct kvm_sbi_fwft_feature *feature; + int i; + + for (i = 0; i < ARRAY_SIZE(features); i++) { + feature = &features[i]; + if (feature->first_reg_num <= reg_num && reg_num < (feature->first_reg_num + 3)) + return feature; + } + + return NULL; +} + +static struct kvm_sbi_fwft_config * +kvm_sbi_fwft_get_config(struct kvm_vcpu *vcpu, enum sbi_fwft_feature_t feature) +{ + int i; + struct kvm_sbi_fwft *fwft = vcpu_to_fwft(vcpu); + + for (i = 0; i < ARRAY_SIZE(features); i++) { + if (fwft->configs[i].feature->id == feature) + return &fwft->configs[i]; + } + + return NULL; +} + +static int kvm_fwft_get_feature(struct kvm_vcpu *vcpu, u32 feature, + struct kvm_sbi_fwft_config **conf) +{ + struct kvm_sbi_fwft_config *tconf; + + tconf = kvm_sbi_fwft_get_config(vcpu, feature); + if (!tconf) { + if (kvm_fwft_is_defined_feature(feature)) + return SBI_ERR_NOT_SUPPORTED; + + return SBI_ERR_DENIED; + } + + if (!tconf->supported || !tconf->enabled) + return SBI_ERR_NOT_SUPPORTED; + + *conf = tconf; + + return SBI_SUCCESS; +} + +static int kvm_sbi_fwft_set(struct kvm_vcpu *vcpu, u32 feature, + unsigned long value, unsigned long flags) +{ + int ret; + struct kvm_sbi_fwft_config *conf; + + ret = kvm_fwft_get_feature(vcpu, feature, &conf); + if (ret) + return ret; + + if ((flags & ~SBI_FWFT_SET_FLAG_LOCK) != 0) + return SBI_ERR_INVALID_PARAM; + + if (conf->flags & SBI_FWFT_SET_FLAG_LOCK) + return SBI_ERR_DENIED_LOCKED; + + conf->flags = flags; + + return conf->feature->set(vcpu, conf, false, value); +} + +static int kvm_sbi_fwft_get(struct kvm_vcpu *vcpu, unsigned long feature, + unsigned long *value) +{ + int ret; + struct kvm_sbi_fwft_config *conf; + + ret = kvm_fwft_get_feature(vcpu, feature, &conf); + if (ret) + return ret; + + return conf->feature->get(vcpu, conf, false, value); +} + +static int kvm_sbi_ext_fwft_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) +{ + int ret; + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + unsigned long funcid = cp->a6; + + switch (funcid) { + case SBI_EXT_FWFT_SET: + ret = kvm_sbi_fwft_set(vcpu, cp->a0, cp->a1, cp->a2); + break; + case SBI_EXT_FWFT_GET: + ret = kvm_sbi_fwft_get(vcpu, cp->a0, &retdata->out_val); + break; + default: + ret = SBI_ERR_NOT_SUPPORTED; + break; + } + + retdata->err_val = ret; + + return 0; +} + +static int kvm_sbi_ext_fwft_init(struct kvm_vcpu *vcpu) +{ + struct kvm_sbi_fwft *fwft = vcpu_to_fwft(vcpu); + const struct kvm_sbi_fwft_feature *feature; + struct kvm_sbi_fwft_config *conf; + int i; + + fwft->configs = kcalloc(ARRAY_SIZE(features), sizeof(struct kvm_sbi_fwft_config), + GFP_KERNEL); + if (!fwft->configs) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(features); i++) { + feature = &features[i]; + conf = &fwft->configs[i]; + if (feature->supported) + conf->supported = feature->supported(vcpu); + else + conf->supported = true; + + conf->enabled = conf->supported; + conf->feature = feature; + } + + return 0; +} + +static void kvm_sbi_ext_fwft_deinit(struct kvm_vcpu *vcpu) +{ + struct kvm_sbi_fwft *fwft = vcpu_to_fwft(vcpu); + + kfree(fwft->configs); +} + +static void kvm_sbi_ext_fwft_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_sbi_fwft *fwft = vcpu_to_fwft(vcpu); + const struct kvm_sbi_fwft_feature *feature; + int i; + + for (i = 0; i < ARRAY_SIZE(features); i++) { + fwft->configs[i].flags = 0; + feature = &features[i]; + if (feature->reset) + feature->reset(vcpu); + } +} + +static unsigned long kvm_sbi_ext_fwft_get_reg_count(struct kvm_vcpu *vcpu) +{ + unsigned long max_reg_count = sizeof(struct kvm_riscv_sbi_fwft) / sizeof(unsigned long); + const struct kvm_sbi_fwft_feature *feature; + struct kvm_sbi_fwft_config *conf; + unsigned long reg, ret = 0; + + for (reg = 0; reg < max_reg_count; reg++) { + feature = kvm_sbi_fwft_regnum_to_feature(reg); + if (!feature) + continue; + + conf = kvm_sbi_fwft_get_config(vcpu, feature->id); + if (!conf || !conf->supported) + continue; + + ret++; + } + + return ret; +} + +static int kvm_sbi_ext_fwft_get_reg_id(struct kvm_vcpu *vcpu, int index, u64 *reg_id) +{ + int reg, max_reg_count = sizeof(struct kvm_riscv_sbi_fwft) / sizeof(unsigned long); + const struct kvm_sbi_fwft_feature *feature; + struct kvm_sbi_fwft_config *conf; + int idx = 0; + + for (reg = 0; reg < max_reg_count; reg++) { + feature = kvm_sbi_fwft_regnum_to_feature(reg); + if (!feature) + continue; + + conf = kvm_sbi_fwft_get_config(vcpu, feature->id); + if (!conf || !conf->supported) + continue; + + if (index == idx) { + *reg_id = KVM_REG_RISCV | + (IS_ENABLED(CONFIG_32BIT) ? + KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64) | + KVM_REG_RISCV_SBI_STATE | + KVM_REG_RISCV_SBI_FWFT | reg; + return 0; + } + + idx++; + } + + return -ENOENT; +} + +static int kvm_sbi_ext_fwft_get_reg(struct kvm_vcpu *vcpu, unsigned long reg_num, + unsigned long reg_size, void *reg_val) +{ + const struct kvm_sbi_fwft_feature *feature; + struct kvm_sbi_fwft_config *conf; + unsigned long *value; + int ret = 0; + + if (reg_size != sizeof(unsigned long)) + return -EINVAL; + value = reg_val; + + feature = kvm_sbi_fwft_regnum_to_feature(reg_num); + if (!feature) + return -ENOENT; + + conf = kvm_sbi_fwft_get_config(vcpu, feature->id); + if (!conf || !conf->supported) + return -ENOENT; + + switch (reg_num - feature->first_reg_num) { + case 0: + *value = conf->enabled; + break; + case 1: + *value = conf->flags; + break; + case 2: + ret = conf->feature->get(vcpu, conf, true, value); + break; + default: + return -ENOENT; + } + + return sbi_err_map_linux_errno(ret); +} + +static int kvm_sbi_ext_fwft_set_reg(struct kvm_vcpu *vcpu, unsigned long reg_num, + unsigned long reg_size, const void *reg_val) +{ + const struct kvm_sbi_fwft_feature *feature; + struct kvm_sbi_fwft_config *conf; + unsigned long value; + int ret = 0; + + if (reg_size != sizeof(unsigned long)) + return -EINVAL; + value = *(const unsigned long *)reg_val; + + feature = kvm_sbi_fwft_regnum_to_feature(reg_num); + if (!feature) + return -ENOENT; + + conf = kvm_sbi_fwft_get_config(vcpu, feature->id); + if (!conf || !conf->supported) + return -ENOENT; + + switch (reg_num - feature->first_reg_num) { + case 0: + switch (value) { + case 0: + conf->enabled = false; + break; + case 1: + conf->enabled = true; + break; + default: + return -EINVAL; + } + break; + case 1: + conf->flags = value & SBI_FWFT_SET_FLAG_LOCK; + break; + case 2: + ret = conf->feature->set(vcpu, conf, true, value); + break; + default: + return -ENOENT; + } + + return sbi_err_map_linux_errno(ret); +} + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_fwft = { + .extid_start = SBI_EXT_FWFT, + .extid_end = SBI_EXT_FWFT, + .handler = kvm_sbi_ext_fwft_handler, + .init = kvm_sbi_ext_fwft_init, + .deinit = kvm_sbi_ext_fwft_deinit, + .reset = kvm_sbi_ext_fwft_reset, + .state_reg_subtype = KVM_REG_RISCV_SBI_FWFT, + .get_state_reg_count = kvm_sbi_ext_fwft_get_reg_count, + .get_state_reg_id = kvm_sbi_ext_fwft_get_reg_id, + .get_state_reg = kvm_sbi_ext_fwft_get_reg, + .set_state_reg = kvm_sbi_ext_fwft_set_reg, +}; diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c index 2e915cafd551..f26207f84bab 100644 --- a/arch/riscv/kvm/vcpu_sbi_hsm.c +++ b/arch/riscv/kvm/vcpu_sbi_hsm.c @@ -9,44 +9,55 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/kvm_host.h> +#include <linux/wordpart.h> #include <asm/sbi.h> #include <asm/kvm_vcpu_sbi.h> static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu) { - struct kvm_cpu_context *reset_cntx; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; struct kvm_vcpu *target_vcpu; unsigned long target_vcpuid = cp->a0; + int ret = 0; target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid); if (!target_vcpu) - return -EINVAL; - if (!target_vcpu->arch.power_off) - return -EALREADY; + return SBI_ERR_INVALID_PARAM; - reset_cntx = &target_vcpu->arch.guest_reset_context; - /* start address */ - reset_cntx->sepc = cp->a1; - /* target vcpu id to start */ - reset_cntx->a0 = target_vcpuid; - /* private data passed from kernel */ - reset_cntx->a1 = cp->a2; - kvm_make_request(KVM_REQ_VCPU_RESET, target_vcpu); + spin_lock(&target_vcpu->arch.mp_state_lock); - kvm_riscv_vcpu_power_on(target_vcpu); + if (!kvm_riscv_vcpu_stopped(target_vcpu)) { + ret = SBI_ERR_ALREADY_AVAILABLE; + goto out; + } - return 0; + kvm_riscv_vcpu_sbi_request_reset(target_vcpu, cp->a1, cp->a2); + + __kvm_riscv_vcpu_power_on(target_vcpu); + +out: + spin_unlock(&target_vcpu->arch.mp_state_lock); + + return ret; } static int kvm_sbi_hsm_vcpu_stop(struct kvm_vcpu *vcpu) { - if (vcpu->arch.power_off) - return -EINVAL; + int ret = 0; - kvm_riscv_vcpu_power_off(vcpu); + spin_lock(&vcpu->arch.mp_state_lock); - return 0; + if (kvm_riscv_vcpu_stopped(vcpu)) { + ret = SBI_ERR_FAILURE; + goto out; + } + + __kvm_riscv_vcpu_power_off(vcpu); + +out: + spin_unlock(&vcpu->arch.mp_state_lock); + + return ret; } static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu) @@ -57,30 +68,25 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu) target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid); if (!target_vcpu) - return -EINVAL; - if (!target_vcpu->arch.power_off) - return SBI_HSM_STATE_STARTED; - else if (vcpu->stat.generic.blocking) + return SBI_ERR_INVALID_PARAM; + if (kvm_riscv_vcpu_stopped(target_vcpu)) + return SBI_HSM_STATE_STOPPED; + else if (target_vcpu->stat.generic.blocking) return SBI_HSM_STATE_SUSPENDED; else - return SBI_HSM_STATE_STOPPED; + return SBI_HSM_STATE_STARTED; } static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, - bool *exit) + struct kvm_vcpu_sbi_return *retdata) { int ret = 0; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; - struct kvm *kvm = vcpu->kvm; unsigned long funcid = cp->a6; switch (funcid) { case SBI_EXT_HSM_HART_START: - mutex_lock(&kvm->lock); ret = kvm_sbi_hsm_vcpu_start(vcpu); - mutex_unlock(&kvm->lock); break; case SBI_EXT_HSM_HART_STOP: ret = kvm_sbi_hsm_vcpu_stop(vcpu); @@ -88,27 +94,29 @@ static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, case SBI_EXT_HSM_HART_STATUS: ret = kvm_sbi_hsm_vcpu_get_status(vcpu); if (ret >= 0) { - *out_val = ret; - ret = 0; + retdata->out_val = ret; + retdata->err_val = 0; } - break; + return 0; case SBI_EXT_HSM_HART_SUSPEND: - switch (cp->a0) { + switch (lower_32_bits(cp->a0)) { case SBI_HSM_SUSPEND_RET_DEFAULT: kvm_riscv_vcpu_wfi(vcpu); break; case SBI_HSM_SUSPEND_NON_RET_DEFAULT: - ret = -EOPNOTSUPP; + ret = SBI_ERR_NOT_SUPPORTED; break; default: - ret = -EINVAL; + ret = SBI_ERR_INVALID_PARAM; } break; default: - ret = -EOPNOTSUPP; + ret = SBI_ERR_NOT_SUPPORTED; } - return ret; + retdata->err_val = ret; + + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm = { diff --git a/arch/riscv/kvm/vcpu_sbi_pmu.c b/arch/riscv/kvm/vcpu_sbi_pmu.c new file mode 100644 index 000000000000..a020d979d179 --- /dev/null +++ b/arch/riscv/kvm/vcpu_sbi_pmu.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Rivos Inc + * + * Authors: + * Atish Patra <atishp@rivosinc.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <asm/csr.h> +#include <asm/sbi.h> +#include <asm/kvm_vcpu_sbi.h> + +static int kvm_sbi_ext_pmu_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) +{ + int ret = 0; + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + unsigned long funcid = cp->a6; + u64 temp; + + if (!kvpmu->init_done) { + retdata->err_val = SBI_ERR_NOT_SUPPORTED; + return 0; + } + + switch (funcid) { + case SBI_EXT_PMU_NUM_COUNTERS: + ret = kvm_riscv_vcpu_pmu_num_ctrs(vcpu, retdata); + break; + case SBI_EXT_PMU_COUNTER_GET_INFO: + ret = kvm_riscv_vcpu_pmu_ctr_info(vcpu, cp->a0, retdata); + break; + case SBI_EXT_PMU_COUNTER_CFG_MATCH: +#if defined(CONFIG_32BIT) + temp = ((uint64_t)cp->a5 << 32) | cp->a4; +#else + temp = cp->a4; +#endif + /* + * This can fail if perf core framework fails to create an event. + * No need to forward the error to userspace and exit the guest. + * The operation can continue without profiling. Forward the + * appropriate SBI error to the guest. + */ + ret = kvm_riscv_vcpu_pmu_ctr_cfg_match(vcpu, cp->a0, cp->a1, + cp->a2, cp->a3, temp, retdata); + break; + case SBI_EXT_PMU_COUNTER_START: +#if defined(CONFIG_32BIT) + temp = ((uint64_t)cp->a4 << 32) | cp->a3; +#else + temp = cp->a3; +#endif + ret = kvm_riscv_vcpu_pmu_ctr_start(vcpu, cp->a0, cp->a1, cp->a2, + temp, retdata); + break; + case SBI_EXT_PMU_COUNTER_STOP: + ret = kvm_riscv_vcpu_pmu_ctr_stop(vcpu, cp->a0, cp->a1, cp->a2, retdata); + break; + case SBI_EXT_PMU_COUNTER_FW_READ: + ret = kvm_riscv_vcpu_pmu_fw_ctr_read(vcpu, cp->a0, retdata); + break; + case SBI_EXT_PMU_COUNTER_FW_READ_HI: + if (IS_ENABLED(CONFIG_32BIT)) + ret = kvm_riscv_vcpu_pmu_fw_ctr_read_hi(vcpu, cp->a0, retdata); + else + retdata->out_val = 0; + break; + case SBI_EXT_PMU_SNAPSHOT_SET_SHMEM: + ret = kvm_riscv_vcpu_pmu_snapshot_set_shmem(vcpu, cp->a0, cp->a1, cp->a2, retdata); + break; + case SBI_EXT_PMU_EVENT_GET_INFO: + ret = kvm_riscv_vcpu_pmu_event_info(vcpu, cp->a0, cp->a1, cp->a2, cp->a3, retdata); + break; + default: + retdata->err_val = SBI_ERR_NOT_SUPPORTED; + } + + return ret; +} + +static unsigned long kvm_sbi_ext_pmu_probe(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + + return kvpmu->init_done; +} + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = { + .extid_start = SBI_EXT_PMU, + .extid_end = SBI_EXT_PMU, + .handler = kvm_sbi_ext_pmu_handler, + .probe = kvm_sbi_ext_pmu_probe, +}; diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 03a0198389f0..506a510b6bff 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -11,19 +11,21 @@ #include <linux/kvm_host.h> #include <asm/sbi.h> #include <asm/kvm_vcpu_timer.h> +#include <asm/kvm_vcpu_pmu.h> #include <asm/kvm_vcpu_sbi.h> static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { - int ret = 0; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; u64 next_cycle; - if (cp->a6 != SBI_EXT_TIME_SET_TIMER) - return -EINVAL; + if (cp->a6 != SBI_EXT_TIME_SET_TIMER) { + retdata->err_val = SBI_ERR_NOT_SUPPORTED; + return 0; + } + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_SET_TIMER); #if __riscv_xlen == 32 next_cycle = ((u64)cp->a1 << 32) | (u64)cp->a0; #else @@ -31,7 +33,7 @@ static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, #endif kvm_riscv_vcpu_timer_next_event(vcpu, next_cycle); - return ret; + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_time = { @@ -41,8 +43,7 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_time = { }; static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { int ret = 0; unsigned long i; @@ -50,22 +51,35 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; + unsigned long hart_bit = 0, sentmask = 0; - if (cp->a6 != SBI_EXT_IPI_SEND_IPI) - return -EINVAL; + if (cp->a6 != SBI_EXT_IPI_SEND_IPI) { + retdata->err_val = SBI_ERR_NOT_SUPPORTED; + return 0; + } + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_IPI_SENT); kvm_for_each_vcpu(i, tmp, vcpu->kvm) { if (hbase != -1UL) { if (tmp->vcpu_id < hbase) continue; - if (!(hmask & (1UL << (tmp->vcpu_id - hbase)))) + hart_bit = tmp->vcpu_id - hbase; + if (hart_bit >= __riscv_xlen) + goto done; + if (!(hmask & (1UL << hart_bit))) continue; } ret = kvm_riscv_vcpu_set_interrupt(tmp, IRQ_VS_SOFT); if (ret < 0) break; + sentmask |= 1UL << hart_bit; + kvm_riscv_vcpu_pmu_incr_fw(tmp, SBI_PMU_FW_IPI_RCVD); } +done: + if (hbase != -1UL && (hmask ^ sentmask)) + retdata->err_val = SBI_ERR_INVALID_PARAM; + return ret; } @@ -76,35 +90,37 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_ipi = { }; static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { - int ret = 0; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; unsigned long funcid = cp->a6; + unsigned long vmid; switch (funcid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: kvm_riscv_fence_i(vcpu->kvm, hbase, hmask); + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - if (cp->a2 == 0 && cp->a3 == 0) - kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask); + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); + if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) + kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask, vmid); else kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask, - cp->a2, cp->a3, PAGE_SHIFT); + cp->a2, cp->a3, PAGE_SHIFT, vmid); + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - if (cp->a2 == 0 && cp->a3 == 0) - kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, - hbase, hmask, cp->a4); + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); + if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) + kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, hbase, hmask, + cp->a4, vmid); else - kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, - hbase, hmask, - cp->a2, cp->a3, - PAGE_SHIFT, cp->a4); + kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, hbase, hmask, cp->a2, + cp->a3, PAGE_SHIFT, cp->a4, vmid); + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_SENT); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID: @@ -112,14 +128,14 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: /* * Until nested virtualization is implemented, the - * SBI HFENCE calls should be treated as NOPs + * SBI HFENCE calls should return not supported + * hence fallthrough. */ - break; default: - ret = -EOPNOTSUPP; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; } - return ret; + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence = { @@ -130,14 +146,12 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence = { static int kvm_sbi_ext_srst_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long funcid = cp->a6; u32 reason = cp->a1; u32 type = cp->a0; - int ret = 0; switch (funcid) { case SBI_EXT_SRST_RESET: @@ -146,24 +160,24 @@ static int kvm_sbi_ext_srst_handler(struct kvm_vcpu *vcpu, kvm_riscv_vcpu_sbi_system_reset(vcpu, run, KVM_SYSTEM_EVENT_SHUTDOWN, reason); - *exit = true; + retdata->uexit = true; break; case SBI_SRST_RESET_TYPE_COLD_REBOOT: case SBI_SRST_RESET_TYPE_WARM_REBOOT: kvm_riscv_vcpu_sbi_system_reset(vcpu, run, KVM_SYSTEM_EVENT_RESET, reason); - *exit = true; + retdata->uexit = true; break; default: - ret = -EOPNOTSUPP; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; } break; default: - ret = -EOPNOTSUPP; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; } - return ret; + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst = { diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c new file mode 100644 index 000000000000..afa0545c3bcf --- /dev/null +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Ventana Micro Systems Inc. + */ + +#include <linux/kconfig.h> +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/mm.h> +#include <linux/sizes.h> + +#include <asm/bug.h> +#include <asm/current.h> +#include <asm/kvm_vcpu_sbi.h> +#include <asm/page.h> +#include <asm/sbi.h> +#include <asm/uaccess.h> + +static void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu) +{ + vcpu->arch.sta.shmem = INVALID_GPA; + vcpu->arch.sta.last_steal = 0; +} + +void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu) +{ + gpa_t shmem = vcpu->arch.sta.shmem; + u64 last_steal = vcpu->arch.sta.last_steal; + __le32 __user *sequence_ptr; + __le64 __user *steal_ptr; + __le32 sequence_le; + __le64 steal_le; + u32 sequence; + u64 steal; + unsigned long hva; + gfn_t gfn; + + if (shmem == INVALID_GPA) + return; + + /* + * shmem is 64-byte aligned (see the enforcement in + * kvm_sbi_sta_steal_time_set_shmem()) and the size of sbi_sta_struct + * is 64 bytes, so we know all its offsets are in the same page. + */ + gfn = shmem >> PAGE_SHIFT; + hva = kvm_vcpu_gfn_to_hva(vcpu, gfn); + + if (WARN_ON(kvm_is_error_hva(hva))) { + vcpu->arch.sta.shmem = INVALID_GPA; + return; + } + + sequence_ptr = (__le32 __user *)(hva + offset_in_page(shmem) + + offsetof(struct sbi_sta_struct, sequence)); + steal_ptr = (__le64 __user *)(hva + offset_in_page(shmem) + + offsetof(struct sbi_sta_struct, steal)); + + if (WARN_ON(get_user(sequence_le, sequence_ptr))) + return; + + sequence = le32_to_cpu(sequence_le); + sequence += 1; + + if (WARN_ON(put_user(cpu_to_le32(sequence), sequence_ptr))) + return; + + if (!WARN_ON(get_user(steal_le, steal_ptr))) { + steal = le64_to_cpu(steal_le); + vcpu->arch.sta.last_steal = READ_ONCE(current->sched_info.run_delay); + steal += vcpu->arch.sta.last_steal - last_steal; + WARN_ON(put_user(cpu_to_le64(steal), steal_ptr)); + } + + sequence += 1; + WARN_ON(put_user(cpu_to_le32(sequence), sequence_ptr)); + + kvm_vcpu_mark_page_dirty(vcpu, gfn); +} + +static int kvm_sbi_sta_steal_time_set_shmem(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + unsigned long shmem_phys_lo = cp->a0; + unsigned long shmem_phys_hi = cp->a1; + u32 flags = cp->a2; + struct sbi_sta_struct zero_sta = {0}; + gpa_t shmem; + int ret; + + if (flags != 0) + return SBI_ERR_INVALID_PARAM; + + if (shmem_phys_lo == SBI_SHMEM_DISABLE && + shmem_phys_hi == SBI_SHMEM_DISABLE) { + vcpu->arch.sta.shmem = INVALID_GPA; + return 0; + } + + if (shmem_phys_lo & (SZ_64 - 1)) + return SBI_ERR_INVALID_PARAM; + + shmem = shmem_phys_lo; + + if (shmem_phys_hi != 0) { + if (IS_ENABLED(CONFIG_32BIT)) + shmem |= ((gpa_t)shmem_phys_hi << 32); + else + return SBI_ERR_INVALID_ADDRESS; + } + + /* No need to check writable slot explicitly as kvm_vcpu_write_guest does it internally */ + ret = kvm_vcpu_write_guest(vcpu, shmem, &zero_sta, sizeof(zero_sta)); + if (ret) + return SBI_ERR_INVALID_ADDRESS; + + vcpu->arch.sta.shmem = shmem; + vcpu->arch.sta.last_steal = current->sched_info.run_delay; + + return 0; +} + +static int kvm_sbi_ext_sta_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + unsigned long funcid = cp->a6; + int ret; + + switch (funcid) { + case SBI_EXT_STA_STEAL_TIME_SET_SHMEM: + ret = kvm_sbi_sta_steal_time_set_shmem(vcpu); + break; + default: + ret = SBI_ERR_NOT_SUPPORTED; + break; + } + + retdata->err_val = ret; + + return 0; +} + +static unsigned long kvm_sbi_ext_sta_probe(struct kvm_vcpu *vcpu) +{ + return !!sched_info_on(); +} + +static unsigned long kvm_sbi_ext_sta_get_state_reg_count(struct kvm_vcpu *vcpu) +{ + return sizeof(struct kvm_riscv_sbi_sta) / sizeof(unsigned long); +} + +static int kvm_sbi_ext_sta_get_reg(struct kvm_vcpu *vcpu, unsigned long reg_num, + unsigned long reg_size, void *reg_val) +{ + unsigned long *value; + + if (reg_size != sizeof(unsigned long)) + return -EINVAL; + value = reg_val; + + switch (reg_num) { + case KVM_REG_RISCV_SBI_STA_REG(shmem_lo): + *value = (unsigned long)vcpu->arch.sta.shmem; + break; + case KVM_REG_RISCV_SBI_STA_REG(shmem_hi): + if (IS_ENABLED(CONFIG_32BIT)) + *value = upper_32_bits(vcpu->arch.sta.shmem); + else + *value = 0; + break; + default: + return -ENOENT; + } + + return 0; +} + +static int kvm_sbi_ext_sta_set_reg(struct kvm_vcpu *vcpu, unsigned long reg_num, + unsigned long reg_size, const void *reg_val) +{ + unsigned long value; + + if (reg_size != sizeof(unsigned long)) + return -EINVAL; + value = *(const unsigned long *)reg_val; + + switch (reg_num) { + case KVM_REG_RISCV_SBI_STA_REG(shmem_lo): + if (IS_ENABLED(CONFIG_32BIT)) { + gpa_t hi = upper_32_bits(vcpu->arch.sta.shmem); + + vcpu->arch.sta.shmem = value; + vcpu->arch.sta.shmem |= hi << 32; + } else { + vcpu->arch.sta.shmem = value; + } + break; + case KVM_REG_RISCV_SBI_STA_REG(shmem_hi): + if (IS_ENABLED(CONFIG_32BIT)) { + gpa_t lo = lower_32_bits(vcpu->arch.sta.shmem); + + vcpu->arch.sta.shmem = ((gpa_t)value << 32); + vcpu->arch.sta.shmem |= lo; + } else if (value != 0) { + return -EINVAL; + } + break; + default: + return -ENOENT; + } + + return 0; +} + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_sta = { + .extid_start = SBI_EXT_STA, + .extid_end = SBI_EXT_STA, + .handler = kvm_sbi_ext_sta_handler, + .probe = kvm_sbi_ext_sta_probe, + .reset = kvm_riscv_vcpu_sbi_sta_reset, + .state_reg_subtype = KVM_REG_RISCV_SBI_STA, + .get_state_reg_count = kvm_sbi_ext_sta_get_state_reg_count, + .get_state_reg = kvm_sbi_ext_sta_get_reg, + .set_state_reg = kvm_sbi_ext_sta_set_reg, +}; diff --git a/arch/riscv/kvm/vcpu_sbi_system.c b/arch/riscv/kvm/vcpu_sbi_system.c new file mode 100644 index 000000000000..c6f7e609ac79 --- /dev/null +++ b/arch/riscv/kvm/vcpu_sbi_system.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024 Ventana Micro Systems Inc. + */ + +#include <linux/kvm_host.h> +#include <linux/wordpart.h> + +#include <asm/kvm_vcpu_sbi.h> +#include <asm/sbi.h> + +static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + unsigned long funcid = cp->a6; + unsigned long hva, i; + struct kvm_vcpu *tmp; + + switch (funcid) { + case SBI_EXT_SUSP_SYSTEM_SUSPEND: + if (lower_32_bits(cp->a0) != SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM) { + retdata->err_val = SBI_ERR_INVALID_PARAM; + return 0; + } + + if (!(cp->sstatus & SR_SPP)) { + retdata->err_val = SBI_ERR_FAILURE; + return 0; + } + + hva = kvm_vcpu_gfn_to_hva_prot(vcpu, cp->a1 >> PAGE_SHIFT, NULL); + if (kvm_is_error_hva(hva)) { + retdata->err_val = SBI_ERR_INVALID_ADDRESS; + return 0; + } + + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { + if (tmp == vcpu) + continue; + if (!kvm_riscv_vcpu_stopped(tmp)) { + retdata->err_val = SBI_ERR_DENIED; + return 0; + } + } + + kvm_riscv_vcpu_sbi_request_reset(vcpu, cp->a1, cp->a2); + + /* userspace provides the suspend implementation */ + return kvm_riscv_vcpu_sbi_forward_handler(vcpu, run, retdata); + default: + retdata->err_val = SBI_ERR_NOT_SUPPORTED; + break; + } + + return 0; +} + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_susp = { + .extid_start = SBI_EXT_SUSP, + .extid_end = SBI_EXT_SUSP, + .default_disabled = true, + .handler = kvm_sbi_ext_susp_handler, +}; diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index 489f225ee66d..188d5ea5b3b8 100644 --- a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -14,9 +14,7 @@ #include <asm/kvm_vcpu_sbi.h> static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, - bool *exit) + struct kvm_vcpu_sbi_return *retdata) { ulong hmask; int i, ret = 0; @@ -24,6 +22,8 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_vcpu *rvcpu; struct kvm *kvm = vcpu->kvm; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + struct kvm_cpu_trap *utrap = retdata->utrap; + unsigned long vmid; switch (cp->a7) { case SBI_EXT_0_1_CONSOLE_GETCHAR: @@ -32,8 +32,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, * The CONSOLE_GETCHAR/CONSOLE_PUTCHAR SBI calls cannot be * handled in kernel so we forward these to user-space */ - kvm_riscv_vcpu_sbi_forward(vcpu, run); - *exit = true; + ret = kvm_riscv_vcpu_sbi_forward_handler(vcpu, run, retdata); break; case SBI_EXT_0_1_SET_TIMER: #if __riscv_xlen == 32 @@ -48,8 +47,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, break; case SBI_EXT_0_1_SEND_IPI: if (cp->a0) - hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, - utrap); + hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, utrap); else hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1; if (utrap->scause) @@ -65,14 +63,13 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, case SBI_EXT_0_1_SHUTDOWN: kvm_riscv_vcpu_sbi_system_reset(vcpu, run, KVM_SYSTEM_EVENT_SHUTDOWN, 0); - *exit = true; + retdata->uexit = true; break; case SBI_EXT_0_1_REMOTE_FENCE_I: case SBI_EXT_0_1_REMOTE_SFENCE_VMA: case SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID: if (cp->a0) - hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, - utrap); + hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, utrap); else hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1; if (utrap->scause) @@ -81,29 +78,25 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I) kvm_riscv_fence_i(vcpu->kvm, 0, hmask); else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) { + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); if (cp->a1 == 0 && cp->a2 == 0) - kvm_riscv_hfence_vvma_all(vcpu->kvm, - 0, hmask); + kvm_riscv_hfence_vvma_all(vcpu->kvm, 0, hmask, vmid); else - kvm_riscv_hfence_vvma_gva(vcpu->kvm, - 0, hmask, - cp->a1, cp->a2, - PAGE_SHIFT); + kvm_riscv_hfence_vvma_gva(vcpu->kvm, 0, hmask, cp->a1, + cp->a2, PAGE_SHIFT, vmid); } else { + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); if (cp->a1 == 0 && cp->a2 == 0) - kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, - 0, hmask, - cp->a3); + kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, 0, hmask, + cp->a3, vmid); else - kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, - 0, hmask, - cp->a1, cp->a2, - PAGE_SHIFT, - cp->a3); + kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, 0, hmask, + cp->a1, cp->a2, PAGE_SHIFT, + cp->a3, vmid); } break; default: - ret = -EINVAL; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; break; } diff --git a/arch/riscv/kvm/vcpu_switch.S b/arch/riscv/kvm/vcpu_switch.S index d74df8eb4d71..47686bcb21e0 100644 --- a/arch/riscv/kvm/vcpu_switch.S +++ b/arch/riscv/kvm/vcpu_switch.S @@ -11,11 +11,7 @@ #include <asm/asm-offsets.h> #include <asm/csr.h> - .text - .altmacro - .option norelax - -ENTRY(__kvm_riscv_switch_to) +.macro SAVE_HOST_GPRS /* Save Host GPRs (except A0 and T0-T6) */ REG_S ra, (KVM_ARCH_HOST_RA)(a0) REG_S sp, (KVM_ARCH_HOST_SP)(a0) @@ -40,39 +36,33 @@ ENTRY(__kvm_riscv_switch_to) REG_S s9, (KVM_ARCH_HOST_S9)(a0) REG_S s10, (KVM_ARCH_HOST_S10)(a0) REG_S s11, (KVM_ARCH_HOST_S11)(a0) +.endm +.macro SAVE_HOST_AND_RESTORE_GUEST_CSRS __resume_addr /* Load Guest CSR values */ REG_L t0, (KVM_ARCH_GUEST_SSTATUS)(a0) - REG_L t1, (KVM_ARCH_GUEST_HSTATUS)(a0) - REG_L t2, (KVM_ARCH_GUEST_SCOUNTEREN)(a0) - la t4, __kvm_switch_return - REG_L t5, (KVM_ARCH_GUEST_SEPC)(a0) + la t1, \__resume_addr + REG_L t2, (KVM_ARCH_GUEST_SEPC)(a0) /* Save Host and Restore Guest SSTATUS */ csrrw t0, CSR_SSTATUS, t0 - /* Save Host and Restore Guest HSTATUS */ - csrrw t1, CSR_HSTATUS, t1 - - /* Save Host and Restore Guest SCOUNTEREN */ - csrrw t2, CSR_SCOUNTEREN, t2 - /* Save Host STVEC and change it to return path */ - csrrw t4, CSR_STVEC, t4 + csrrw t1, CSR_STVEC, t1 + + /* Restore Guest SEPC */ + csrw CSR_SEPC, t2 /* Save Host SSCRATCH and change it to struct kvm_vcpu_arch pointer */ csrrw t3, CSR_SSCRATCH, a0 - /* Restore Guest SEPC */ - csrw CSR_SEPC, t5 - /* Store Host CSR values */ REG_S t0, (KVM_ARCH_HOST_SSTATUS)(a0) - REG_S t1, (KVM_ARCH_HOST_HSTATUS)(a0) - REG_S t2, (KVM_ARCH_HOST_SCOUNTEREN)(a0) + REG_S t1, (KVM_ARCH_HOST_STVEC)(a0) REG_S t3, (KVM_ARCH_HOST_SSCRATCH)(a0) - REG_S t4, (KVM_ARCH_HOST_STVEC)(a0) +.endm +.macro RESTORE_GUEST_GPRS /* Restore Guest GPRs (except A0) */ REG_L ra, (KVM_ARCH_GUEST_RA)(a0) REG_L sp, (KVM_ARCH_GUEST_SP)(a0) @@ -107,13 +97,9 @@ ENTRY(__kvm_riscv_switch_to) /* Restore Guest A0 */ REG_L a0, (KVM_ARCH_GUEST_A0)(a0) +.endm - /* Resume Guest */ - sret - - /* Back to Host */ - .align 2 -__kvm_switch_return: +.macro SAVE_GUEST_GPRS /* Swap Guest A0 with SSCRATCH */ csrrw a0, CSR_SSCRATCH, a0 @@ -148,39 +134,33 @@ __kvm_switch_return: REG_S t4, (KVM_ARCH_GUEST_T4)(a0) REG_S t5, (KVM_ARCH_GUEST_T5)(a0) REG_S t6, (KVM_ARCH_GUEST_T6)(a0) +.endm +.macro SAVE_GUEST_AND_RESTORE_HOST_CSRS /* Load Host CSR values */ - REG_L t1, (KVM_ARCH_HOST_STVEC)(a0) - REG_L t2, (KVM_ARCH_HOST_SSCRATCH)(a0) - REG_L t3, (KVM_ARCH_HOST_SCOUNTEREN)(a0) - REG_L t4, (KVM_ARCH_HOST_HSTATUS)(a0) - REG_L t5, (KVM_ARCH_HOST_SSTATUS)(a0) - - /* Save Guest SEPC */ - csrr t0, CSR_SEPC + REG_L t0, (KVM_ARCH_HOST_STVEC)(a0) + REG_L t1, (KVM_ARCH_HOST_SSCRATCH)(a0) + REG_L t2, (KVM_ARCH_HOST_SSTATUS)(a0) /* Save Guest A0 and Restore Host SSCRATCH */ - csrrw t2, CSR_SSCRATCH, t2 + csrrw t1, CSR_SSCRATCH, t1 - /* Restore Host STVEC */ - csrw CSR_STVEC, t1 - - /* Save Guest and Restore Host SCOUNTEREN */ - csrrw t3, CSR_SCOUNTEREN, t3 + /* Save Guest SEPC */ + csrr t3, CSR_SEPC - /* Save Guest and Restore Host HSTATUS */ - csrrw t4, CSR_HSTATUS, t4 + /* Restore Host STVEC */ + csrw CSR_STVEC, t0 /* Save Guest and Restore Host SSTATUS */ - csrrw t5, CSR_SSTATUS, t5 + csrrw t2, CSR_SSTATUS, t2 /* Store Guest CSR values */ - REG_S t0, (KVM_ARCH_GUEST_SEPC)(a0) - REG_S t2, (KVM_ARCH_GUEST_A0)(a0) - REG_S t3, (KVM_ARCH_GUEST_SCOUNTEREN)(a0) - REG_S t4, (KVM_ARCH_GUEST_HSTATUS)(a0) - REG_S t5, (KVM_ARCH_GUEST_SSTATUS)(a0) + REG_S t1, (KVM_ARCH_GUEST_A0)(a0) + REG_S t2, (KVM_ARCH_GUEST_SSTATUS)(a0) + REG_S t3, (KVM_ARCH_GUEST_SEPC)(a0) +.endm +.macro RESTORE_HOST_GPRS /* Restore Host GPRs (except A0 and T0-T6) */ REG_L ra, (KVM_ARCH_HOST_RA)(a0) REG_L sp, (KVM_ARCH_HOST_SP)(a0) @@ -205,12 +185,69 @@ __kvm_switch_return: REG_L s9, (KVM_ARCH_HOST_S9)(a0) REG_L s10, (KVM_ARCH_HOST_S10)(a0) REG_L s11, (KVM_ARCH_HOST_S11)(a0) +.endm + + .text + .altmacro + .option norelax + + /* + * Parameters: + * A0 <= Pointer to struct kvm_vcpu_arch + */ +SYM_FUNC_START(__kvm_riscv_switch_to) + SAVE_HOST_GPRS + + SAVE_HOST_AND_RESTORE_GUEST_CSRS .Lkvm_switch_return + + RESTORE_GUEST_GPRS + + /* Resume Guest using SRET */ + sret + + /* Back to Host */ + .align 2 +.Lkvm_switch_return: + SAVE_GUEST_GPRS + + SAVE_GUEST_AND_RESTORE_HOST_CSRS + + RESTORE_HOST_GPRS + + /* Return to C code */ + ret +SYM_FUNC_END(__kvm_riscv_switch_to) + + /* + * Parameters: + * A0 <= Pointer to struct kvm_vcpu_arch + * A1 <= SBI extension ID + * A2 <= SBI function ID + */ +SYM_FUNC_START(__kvm_riscv_nacl_switch_to) + SAVE_HOST_GPRS + + SAVE_HOST_AND_RESTORE_GUEST_CSRS .Lkvm_nacl_switch_return + + /* Resume Guest using SBI nested acceleration */ + add a6, a2, zero + add a7, a1, zero + ecall + + /* Back to Host */ + .align 2 +.Lkvm_nacl_switch_return: + SAVE_GUEST_GPRS + + SAVE_GUEST_AND_RESTORE_HOST_CSRS + + RESTORE_HOST_GPRS /* Return to C code */ ret -ENDPROC(__kvm_riscv_switch_to) +SYM_FUNC_END(__kvm_riscv_nacl_switch_to) -ENTRY(__kvm_riscv_unpriv_trap) +SYM_CODE_START(__kvm_riscv_unpriv_trap) /* * We assume that faulting unpriv load/store instruction is * 4-byte long and blindly increment SEPC by 4. @@ -231,12 +268,10 @@ ENTRY(__kvm_riscv_unpriv_trap) csrr a1, CSR_HTINST REG_S a1, (KVM_ARCH_TRAP_HTINST)(a0) sret -ENDPROC(__kvm_riscv_unpriv_trap) +SYM_CODE_END(__kvm_riscv_unpriv_trap) #ifdef CONFIG_FPU - .align 3 - .global __kvm_riscv_fp_f_save -__kvm_riscv_fp_f_save: +SYM_FUNC_START(__kvm_riscv_fp_f_save) csrr t2, CSR_SSTATUS li t1, SR_FS csrs CSR_SSTATUS, t1 @@ -276,10 +311,9 @@ __kvm_riscv_fp_f_save: sw t0, KVM_ARCH_FP_F_FCSR(a0) csrw CSR_SSTATUS, t2 ret +SYM_FUNC_END(__kvm_riscv_fp_f_save) - .align 3 - .global __kvm_riscv_fp_d_save -__kvm_riscv_fp_d_save: +SYM_FUNC_START(__kvm_riscv_fp_d_save) csrr t2, CSR_SSTATUS li t1, SR_FS csrs CSR_SSTATUS, t1 @@ -319,10 +353,9 @@ __kvm_riscv_fp_d_save: sw t0, KVM_ARCH_FP_D_FCSR(a0) csrw CSR_SSTATUS, t2 ret +SYM_FUNC_END(__kvm_riscv_fp_d_save) - .align 3 - .global __kvm_riscv_fp_f_restore -__kvm_riscv_fp_f_restore: +SYM_FUNC_START(__kvm_riscv_fp_f_restore) csrr t2, CSR_SSTATUS li t1, SR_FS lw t0, KVM_ARCH_FP_F_FCSR(a0) @@ -362,10 +395,9 @@ __kvm_riscv_fp_f_restore: fscsr t0 csrw CSR_SSTATUS, t2 ret +SYM_FUNC_END(__kvm_riscv_fp_f_restore) - .align 3 - .global __kvm_riscv_fp_d_restore -__kvm_riscv_fp_d_restore: +SYM_FUNC_START(__kvm_riscv_fp_d_restore) csrr t2, CSR_SSTATUS li t1, SR_FS lw t0, KVM_ARCH_FP_D_FCSR(a0) @@ -405,4 +437,5 @@ __kvm_riscv_fp_d_restore: fscsr t0 csrw CSR_SSTATUS, t2 ret +SYM_FUNC_END(__kvm_riscv_fp_d_restore) #endif diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index ad34519c8a13..85a7262115e1 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -11,8 +11,8 @@ #include <linux/kvm_host.h> #include <linux/uaccess.h> #include <clocksource/timer-riscv.h> -#include <asm/csr.h> #include <asm/delay.h> +#include <asm/kvm_nacl.h> #include <asm/kvm_vcpu_timer.h> static u64 kvm_riscv_current_cycles(struct kvm_guest_timer *gt) @@ -72,12 +72,12 @@ static int kvm_riscv_vcpu_timer_cancel(struct kvm_vcpu_timer *t) static int kvm_riscv_vcpu_update_vstimecmp(struct kvm_vcpu *vcpu, u64 ncycles) { #if defined(CONFIG_32BIT) - csr_write(CSR_VSTIMECMP, ncycles & 0xFFFFFFFF); - csr_write(CSR_VSTIMECMPH, ncycles >> 32); + ncsr_write(CSR_VSTIMECMP, ncycles & 0xFFFFFFFF); + ncsr_write(CSR_VSTIMECMPH, ncycles >> 32); #else - csr_write(CSR_VSTIMECMP, ncycles); + ncsr_write(CSR_VSTIMECMP, ncycles); #endif - return 0; + return 0; } static int kvm_riscv_vcpu_update_hrtimer(struct kvm_vcpu *vcpu, u64 ncycles) @@ -147,10 +147,8 @@ static void kvm_riscv_vcpu_timer_blocking(struct kvm_vcpu *vcpu) return; delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t); - if (delta_ns) { - hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); - t->next_set = true; - } + hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); + t->next_set = true; } static void kvm_riscv_vcpu_timer_unblocking(struct kvm_vcpu *vcpu) @@ -172,7 +170,7 @@ int kvm_riscv_vcpu_get_reg_timer(struct kvm_vcpu *vcpu, if (KVM_REG_SIZE(reg->id) != sizeof(u64)) return -EINVAL; if (reg_num >= sizeof(struct kvm_riscv_timer) / sizeof(u64)) - return -EINVAL; + return -ENOENT; switch (reg_num) { case KVM_REG_RISCV_TIMER_REG(frequency): @@ -189,7 +187,7 @@ int kvm_riscv_vcpu_get_reg_timer(struct kvm_vcpu *vcpu, KVM_RISCV_TIMER_STATE_OFF; break; default: - return -EINVAL; + return -ENOENT; } if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) @@ -213,14 +211,15 @@ int kvm_riscv_vcpu_set_reg_timer(struct kvm_vcpu *vcpu, if (KVM_REG_SIZE(reg->id) != sizeof(u64)) return -EINVAL; if (reg_num >= sizeof(struct kvm_riscv_timer) / sizeof(u64)) - return -EINVAL; + return -ENOENT; if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) return -EFAULT; switch (reg_num) { case KVM_REG_RISCV_TIMER_REG(frequency): - ret = -EOPNOTSUPP; + if (reg_val != riscv_timebase) + return -EINVAL; break; case KVM_REG_RISCV_TIMER_REG(time): gt->time_delta = reg_val - get_cycles64(); @@ -235,7 +234,7 @@ int kvm_riscv_vcpu_set_reg_timer(struct kvm_vcpu *vcpu, ret = kvm_riscv_vcpu_timer_cancel(t); break; default: - ret = -EINVAL; + ret = -ENOENT; break; } @@ -249,18 +248,19 @@ int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu) if (t->init_done) return -EINVAL; - hrtimer_init(&t->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); t->init_done = true; t->next_set = false; /* Enable sstc for every vcpu if available in hardware */ if (riscv_isa_extension_available(NULL, SSTC)) { t->sstc_enabled = true; - t->hrt.function = kvm_riscv_vcpu_vstimer_expired; + hrtimer_setup(&t->hrt, kvm_riscv_vcpu_vstimer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); t->timer_next_event = kvm_riscv_vcpu_update_vstimecmp; } else { t->sstc_enabled = false; - t->hrt.function = kvm_riscv_vcpu_hrtimer_expired; + hrtimer_setup(&t->hrt, kvm_riscv_vcpu_hrtimer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); t->timer_next_event = kvm_riscv_vcpu_update_hrtimer; } @@ -290,10 +290,10 @@ static void kvm_riscv_vcpu_update_timedelta(struct kvm_vcpu *vcpu) struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer; #if defined(CONFIG_32BIT) - csr_write(CSR_HTIMEDELTA, (u32)(gt->time_delta)); - csr_write(CSR_HTIMEDELTAH, (u32)(gt->time_delta >> 32)); + ncsr_write(CSR_HTIMEDELTA, (u32)(gt->time_delta)); + ncsr_write(CSR_HTIMEDELTAH, (u32)(gt->time_delta >> 32)); #else - csr_write(CSR_HTIMEDELTA, gt->time_delta); + ncsr_write(CSR_HTIMEDELTA, gt->time_delta); #endif } @@ -307,10 +307,10 @@ void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu) return; #if defined(CONFIG_32BIT) - csr_write(CSR_VSTIMECMP, (u32)t->next_cycles); - csr_write(CSR_VSTIMECMPH, (u32)(t->next_cycles >> 32)); + ncsr_write(CSR_VSTIMECMP, (u32)t->next_cycles); + ncsr_write(CSR_VSTIMECMPH, (u32)(t->next_cycles >> 32)); #else - csr_write(CSR_VSTIMECMP, t->next_cycles); + ncsr_write(CSR_VSTIMECMP, t->next_cycles); #endif /* timer should be enabled for the remaining operations */ @@ -328,10 +328,10 @@ void kvm_riscv_vcpu_timer_sync(struct kvm_vcpu *vcpu) return; #if defined(CONFIG_32BIT) - t->next_cycles = csr_read(CSR_VSTIMECMP); - t->next_cycles |= (u64)csr_read(CSR_VSTIMECMPH) << 32; + t->next_cycles = ncsr_read(CSR_VSTIMECMP); + t->next_cycles |= (u64)ncsr_read(CSR_VSTIMECMPH) << 32; #else - t->next_cycles = csr_read(CSR_VSTIMECMP); + t->next_cycles = ncsr_read(CSR_VSTIMECMP); #endif } @@ -345,8 +345,24 @@ void kvm_riscv_vcpu_timer_save(struct kvm_vcpu *vcpu) /* * The vstimecmp CSRs are saved by kvm_riscv_vcpu_timer_sync() * upon every VM exit so no need to save here. + * + * If VS-timer expires when no VCPU running on a host CPU then + * WFI executed by such host CPU will be effective NOP resulting + * in no power savings. This is because as-per RISC-V Privileged + * specificaiton: "WFI is also required to resume execution for + * locally enabled interrupts pending at any privilege level, + * regardless of the global interrupt enable at each privilege + * level." + * + * To address the above issue, vstimecmp CSR must be set to -1UL + * over here when VCPU is scheduled-out or exits to user space. */ + csr_write(CSR_VSTIMECMP, -1UL); +#if defined(CONFIG_32BIT) + csr_write(CSR_VSTIMECMPH, -1UL); +#endif + /* timer should be enabled for the remaining operations */ if (unlikely(!t->init_done)) return; diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c new file mode 100644 index 000000000000..05f3cc2d8e31 --- /dev/null +++ b/arch/riscv/kvm/vcpu_vector.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 SiFive + * + * Authors: + * Vincent Chen <vincent.chen@sifive.com> + * Greentime Hu <greentime.hu@sifive.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/uaccess.h> +#include <asm/cpufeature.h> +#include <asm/kvm_vcpu_vector.h> +#include <asm/vector.h> + +#ifdef CONFIG_RISCV_ISA_V +void kvm_riscv_vcpu_vector_reset(struct kvm_vcpu *vcpu) +{ + unsigned long *isa = vcpu->arch.isa; + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + + cntx->sstatus &= ~SR_VS; + + cntx->vector.vlenb = riscv_v_vsize / 32; + + if (riscv_isa_extension_available(isa, v)) { + cntx->sstatus |= SR_VS_INITIAL; + WARN_ON(!cntx->vector.datap); + memset(cntx->vector.datap, 0, riscv_v_vsize); + } else { + cntx->sstatus |= SR_VS_OFF; + } +} + +static void kvm_riscv_vcpu_vector_clean(struct kvm_cpu_context *cntx) +{ + cntx->sstatus &= ~SR_VS; + cntx->sstatus |= SR_VS_CLEAN; +} + +void kvm_riscv_vcpu_guest_vector_save(struct kvm_cpu_context *cntx, + unsigned long *isa) +{ + if ((cntx->sstatus & SR_VS) == SR_VS_DIRTY) { + if (riscv_isa_extension_available(isa, v)) + __kvm_riscv_vector_save(cntx); + kvm_riscv_vcpu_vector_clean(cntx); + } +} + +void kvm_riscv_vcpu_guest_vector_restore(struct kvm_cpu_context *cntx, + unsigned long *isa) +{ + if ((cntx->sstatus & SR_VS) != SR_VS_OFF) { + if (riscv_isa_extension_available(isa, v)) + __kvm_riscv_vector_restore(cntx); + kvm_riscv_vcpu_vector_clean(cntx); + } +} + +void kvm_riscv_vcpu_host_vector_save(struct kvm_cpu_context *cntx) +{ + /* No need to check host sstatus as it can be modified outside */ + if (riscv_isa_extension_available(NULL, v)) + __kvm_riscv_vector_save(cntx); +} + +void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cntx) +{ + if (riscv_isa_extension_available(NULL, v)) + __kvm_riscv_vector_restore(cntx); +} + +int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.guest_context.vector.datap = kzalloc(riscv_v_vsize, GFP_KERNEL); + if (!vcpu->arch.guest_context.vector.datap) + return -ENOMEM; + + vcpu->arch.host_context.vector.datap = kzalloc(riscv_v_vsize, GFP_KERNEL); + if (!vcpu->arch.host_context.vector.datap) + return -ENOMEM; + + return 0; +} + +void kvm_riscv_vcpu_free_vector_context(struct kvm_vcpu *vcpu) +{ + kfree(vcpu->arch.guest_context.vector.datap); + kfree(vcpu->arch.host_context.vector.datap); +} +#endif + +static int kvm_riscv_vcpu_vreg_addr(struct kvm_vcpu *vcpu, + unsigned long reg_num, + size_t reg_size, + void **reg_addr) +{ + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + size_t vlenb = riscv_v_vsize / 32; + + if (reg_num < KVM_REG_RISCV_VECTOR_REG(0)) { + if (reg_size != sizeof(unsigned long)) + return -EINVAL; + switch (reg_num) { + case KVM_REG_RISCV_VECTOR_CSR_REG(vstart): + *reg_addr = &cntx->vector.vstart; + break; + case KVM_REG_RISCV_VECTOR_CSR_REG(vl): + *reg_addr = &cntx->vector.vl; + break; + case KVM_REG_RISCV_VECTOR_CSR_REG(vtype): + *reg_addr = &cntx->vector.vtype; + break; + case KVM_REG_RISCV_VECTOR_CSR_REG(vcsr): + *reg_addr = &cntx->vector.vcsr; + break; + case KVM_REG_RISCV_VECTOR_CSR_REG(vlenb): + *reg_addr = &cntx->vector.vlenb; + break; + case KVM_REG_RISCV_VECTOR_CSR_REG(datap): + default: + return -ENOENT; + } + } else if (reg_num <= KVM_REG_RISCV_VECTOR_REG(31)) { + if (reg_size != vlenb) + return -EINVAL; + *reg_addr = cntx->vector.datap + + (reg_num - KVM_REG_RISCV_VECTOR_REG(0)) * vlenb; + } else { + return -ENOENT; + } + + return 0; +} + +int kvm_riscv_vcpu_get_reg_vector(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long *isa = vcpu->arch.isa; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_VECTOR); + size_t reg_size = KVM_REG_SIZE(reg->id); + void *reg_addr; + int rc; + + if (!riscv_isa_extension_available(isa, v)) + return -ENOENT; + + rc = kvm_riscv_vcpu_vreg_addr(vcpu, reg_num, reg_size, ®_addr); + if (rc) + return rc; + + if (copy_to_user(uaddr, reg_addr, reg_size)) + return -EFAULT; + + return 0; +} + +int kvm_riscv_vcpu_set_reg_vector(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long *isa = vcpu->arch.isa; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_VECTOR); + size_t reg_size = KVM_REG_SIZE(reg->id); + void *reg_addr; + int rc; + + if (!riscv_isa_extension_available(isa, v)) + return -ENOENT; + + if (reg_num == KVM_REG_RISCV_VECTOR_CSR_REG(vlenb)) { + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + unsigned long reg_val; + + if (reg_size != sizeof(reg_val)) + return -EINVAL; + if (copy_from_user(®_val, uaddr, reg_size)) + return -EFAULT; + if (reg_val != cntx->vector.vlenb) + return -EINVAL; + + return 0; + } + + rc = kvm_riscv_vcpu_vreg_addr(vcpu, reg_num, reg_size, ®_addr); + if (rc) + return rc; + + if (copy_from_user(reg_addr, uaddr, reg_size)) + return -EFAULT; + + return 0; +} diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 65a964d7e70d..66d91ae6e9b2 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() @@ -31,16 +32,18 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int r; - r = kvm_riscv_gstage_alloc_pgd(kvm); + r = kvm_riscv_mmu_alloc_pgd(kvm); if (r) return r; r = kvm_riscv_gstage_vmid_init(kvm); if (r) { - kvm_riscv_gstage_free_pgd(kvm); + kvm_riscv_mmu_free_pgd(kvm); return r; } + kvm_riscv_aia_init_vm(kvm); + kvm_riscv_guest_timer_init(kvm); return 0; @@ -49,6 +52,123 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_destroy_vcpus(kvm); + + kvm_riscv_aia_destroy_vm(kvm); +} + +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irql, + bool line_status) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + return kvm_riscv_aia_inject_irq(kvm, irql->irq, irql->level); +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + struct kvm_msi msi; + + if (!level) + return -1; + + msi.address_lo = e->msi.address_lo; + msi.address_hi = e->msi.address_hi; + msi.data = e->msi.data; + msi.flags = e->msi.flags; + msi.devid = e->msi.devid; + + return kvm_riscv_aia_inject_msi(kvm, &msi); +} + +static int kvm_riscv_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + return kvm_riscv_aia_inject_irq(kvm, e->irqchip.pin, level); +} + +int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines) +{ + struct kvm_irq_routing_entry *ents; + int i, rc; + + ents = kcalloc(lines, sizeof(*ents), GFP_KERNEL); + if (!ents) + return -ENOMEM; + + for (i = 0; i < lines; i++) { + ents[i].gsi = i; + ents[i].type = KVM_IRQ_ROUTING_IRQCHIP; + ents[i].u.irqchip.irqchip = 0; + ents[i].u.irqchip.pin = i; + } + rc = kvm_set_irq_routing(kvm, ents, lines, 0); + kfree(ents); + + return rc; +} + +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = kvm_riscv_set_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) || + (e->irqchip.irqchip >= KVM_NR_IRQCHIPS)) + goto out; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + e->msi.flags = ue->flags; + e->msi.devid = ue->u.msi.devid; + break; + default: + goto out; + } + r = 0; +out: + return r; +} + +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + if (!level) + return -EWOULDBLOCK; + + switch (e->type) { + case KVM_IRQ_ROUTING_MSI: + return kvm_set_msi(e, kvm, irq_source_id, level, line_status); + + case KVM_IRQ_ROUTING_IRQCHIP: + return kvm_riscv_set_irq(e, kvm, irq_source_id, + level, line_status); + } + + return -EWOULDBLOCK; +} + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -56,8 +176,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int r; switch (ext) { + case KVM_CAP_IRQCHIP: + r = kvm_riscv_aia_available(); + break; case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: @@ -65,6 +187,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_SET_GUEST_DEBUG: r = 1; break; case KVM_CAP_NR_VCPUS: @@ -77,7 +200,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_VM_GPA_BITS: - r = kvm_riscv_gstage_gpa_bits(); + r = kvm_riscv_gstage_gpa_bits; break; default: r = 0; @@ -87,8 +210,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + switch (cap->cap) { + case KVM_CAP_RISCV_MP_STATE_RESET: + if (cap->flags) + return -EINVAL; + kvm->arch.mp_state_reset = true; + return 0; + default: + return -EINVAL; + } +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { return -EINVAL; } diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 6cd93995fb65..cf34d448289d 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -14,23 +14,23 @@ #include <linux/smp.h> #include <linux/kvm_host.h> #include <asm/csr.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_tlb.h> +#include <asm/kvm_vmid.h> static unsigned long vmid_version = 1; static unsigned long vmid_next; -static unsigned long vmid_bits; +static unsigned long vmid_bits __ro_after_init; static DEFINE_SPINLOCK(vmid_lock); -void kvm_riscv_gstage_vmid_detect(void) +void __init kvm_riscv_gstage_vmid_detect(void) { - unsigned long old; - /* Figure-out number of VMID bits in HW */ - old = csr_read(CSR_HGATP); - csr_write(CSR_HGATP, old | HGATP_VMID_MASK); + csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID); vmid_bits = csr_read(CSR_HGATP); - vmid_bits = (vmid_bits & HGATP_VMID_MASK) >> HGATP_VMID_SHIFT; + vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT; vmid_bits = fls_long(vmid_bits); - csr_write(CSR_HGATP, old); + csr_write(CSR_HGATP, 0); /* We polluted local TLB so flush all guest TLB */ kvm_riscv_local_hfence_gvma_all(); |
