Diffstat (limited to 'arch/arm64/kvm')
111 files changed, 57878 insertions, 2304 deletions
diff --git a/arch/arm64/kvm/.gitignore b/arch/arm64/kvm/.gitignore
new file mode 100644
index 000000000000..6182aefb8302
--- /dev/null
+++ b/arch/arm64/kvm/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+hyp_constants.h
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 21e90820bd23..4f803fd1c99a 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
 #
 # KVM configuration
 #
@@ -6,7 +7,7 @@ source "virt/kvm/Kconfig"
 
 menuconfig VIRTUALIZATION
 	bool "Virtualization"
-	---help---
+	help
 	  Say Y here to get to see options for using your Linux host to run
 	  other operating systems inside virtual machines (guests).
 	  This option alone does not add any kernel code.
@@ -16,36 +17,70 @@ menuconfig VIRTUALIZATION
 
 if VIRTUALIZATION
 
-config KVM
+menuconfig KVM
 	bool "Kernel-based Virtual Machine (KVM) support"
-	select MMU_NOTIFIER
-	select PREEMPT_NOTIFIERS
-	select ANON_INODES
+	select KVM_COMMON
+	select KVM_GENERIC_HARDWARE_ENABLING
+	select KVM_GENERIC_MMU_NOTIFIER
+	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select KVM_MMIO
-	select KVM_ARM_HOST
-	select KVM_ARM_VGIC
-	select KVM_ARM_TIMER
-	---help---
+	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	select VIRT_XFER_TO_GUEST_WORK
+	select KVM_VFIO
+	select HAVE_KVM_DIRTY_RING_ACQ_REL
+	select NEED_KVM_DIRTY_RING_WITH_BITMAP
+	select HAVE_KVM_MSI
+	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_IRQ_ROUTING
+	select HAVE_KVM_IRQ_BYPASS
+	select HAVE_KVM_READONLY_MEM
+	select HAVE_KVM_VCPU_RUN_PID_CHANGE
+	select SCHED_INFO
+	select GUEST_PERF_EVENTS if PERF_EVENTS
+	select KVM_GUEST_MEMFD
+	help
 	  Support hosting virtualized guest machines.
 
 	  If unsure, say N.
 
-config KVM_ARM_HOST
-	bool
-	---help---
-	  Provides host support for ARM processors.
+config NVHE_EL2_DEBUG
+	bool "Debug mode for non-VHE EL2 object"
+	depends on KVM
+	help
+	  Say Y here to enable the debug mode for the non-VHE KVM EL2 object.
+	  Failure reports will BUG() in the hypervisor. This is intended for
+	  local EL2 hypervisor development.
 
-config KVM_ARM_VGIC
-	bool
-	depends on KVM_ARM_HOST && OF
-	select HAVE_KVM_IRQCHIP
-	---help---
-	  Adds support for a hardware assisted, in-kernel GIC emulation.
-
-config KVM_ARM_TIMER
-	bool
-	depends on KVM_ARM_VGIC
-	---help---
-	  Adds support for the Architected Timers in virtual machines.
+	  If unsure, say N.
+
+config PROTECTED_NVHE_STACKTRACE
+	bool "Protected KVM hypervisor stacktraces"
+	depends on NVHE_EL2_DEBUG
+	default n
+	help
+	  Say Y here to enable pKVM hypervisor stacktraces on hyp_panic()
+
+	  If using protected nVHE mode, but cannot afford the associated
+	  memory cost (less than 0.75 page per CPU) of pKVM stacktraces,
+	  say N.
+
+	  If unsure, or not using protected nVHE (pKVM), say N.
+
+config PTDUMP_STAGE2_DEBUGFS
+	bool "Present the stage-2 pagetables to debugfs"
+	depends on KVM
+	depends on DEBUG_KERNEL
+	depends on DEBUG_FS
+	depends on ARCH_HAS_PTDUMP
+	select PTDUMP
+	default n
+	help
+	  Say Y here if you want to show the stage-2 kernel pagetables
+	  layout in a debugfs file. This information is only useful for kernel developers
+	  who are working in architecture specific areas of the kernel.
+	  It is probably not a good idea to enable this feature in a production
+	  kernel.
+
+	  If in doubt, say N.
 endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 72a9fd583ad3..3ebc0570345c 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -1,23 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0
 #
 # Makefile for Kernel-based Virtual Machine module
 #
-ccflags-y += -Ivirt/kvm -Iarch/arm64/kvm
-CFLAGS_arm.o := -I.
-CFLAGS_mmu.o := -I.
+ccflags-y += -I $(src)
 
-KVM=../../../virt/kvm
-ARM=../../../arch/arm/kvm
+include $(srctree)/virt/kvm/Makefile.kvm
 
-obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
+obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM) += hyp/
 
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
+CFLAGS_sys_regs.o += -Wno-override-init
+CFLAGS_handle_exit.o += -Wno-override-init
 
-kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
-kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
-kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
+	 inject_fault.o va_layout.o handle_exit.o config.o \
+	 guest.o debug.o reset.o sys_regs.o stacktrace.o \
+	 vgic-sys-reg-v3.o fpsimd.o pkvm.o \
+	 arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \
+	 vgic/vgic.o vgic/vgic-init.o \
+	 vgic/vgic-irqfd.o vgic/vgic-v2.o \
+	 vgic/vgic-v3.o vgic/vgic-v4.o \
+	 vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
+	 vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
+	 vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o \
+	 vgic/vgic-v5.o
 
-kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
-kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
+kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
+kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
+kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
+
+always-y := hyp_constants.h hyp-constants.s
+
+define rule_gen_hyp_constants
+	$(call filechk,offsets,__HYP_CONSTANTS_H__)
+endef
+
+CFLAGS_hyp-constants.o = -I $(src)/hyp/include
+$(obj)/hyp-constants.s: $(src)/hyp/hyp-constants.c FORCE
+	$(call if_changed_dep,cc_s_c)
+
+$(obj)/hyp_constants.h: $(obj)/hyp-constants.s FORCE
+	$(call if_changed_rule,gen_hyp_constants)
+
+obj-kvm := $(addprefix $(obj)/, $(kvm-y))
+$(obj-kvm): $(obj)/hyp_constants.h
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
new file mode 100644
index 000000000000..99a07972068d
--- /dev/null
+++ b/arch/arm64/kvm/arch_timer.c
@@ -0,0 +1,1712 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/uaccess.h> + +#include <clocksource/arm_arch_timer.h> +#include <asm/arch_timer.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_nested.h> + +#include <kvm/arm_vgic.h> +#include <kvm/arm_arch_timer.h> + +#include "trace.h" + +static struct timecounter *timecounter; +static unsigned int host_vtimer_irq; +static unsigned int host_ptimer_irq; +static u32 host_vtimer_irq_flags; +static u32 host_ptimer_irq_flags; + +static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); +DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key); + +static const u8 default_ppi[] = { + [TIMER_PTIMER] = 30, + [TIMER_VTIMER] = 27, + [TIMER_HPTIMER] = 26, + [TIMER_HVTIMER] = 28, +}; + +static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, + struct arch_timer_context *timer_ctx); +static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val); +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg); +static bool kvm_arch_timer_get_input_level(int vintid); + +static struct irq_ops arch_timer_irq_ops = { + .get_input_level = kvm_arch_timer_get_input_level, +}; + +static int nr_timers(struct kvm_vcpu *vcpu) +{ + if (!vcpu_has_nv(vcpu)) + return NR_KVM_EL0_TIMERS; + + return NR_KVM_TIMERS; +} + +u32 timer_get_ctl(struct arch_timer_context *ctxt) +{ + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); + + switch(arch_timer_ctx_index(ctxt)) { + case TIMER_VTIMER: + return __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); + case TIMER_PTIMER: + return __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + case TIMER_HVTIMER: + return __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2); + case TIMER_HPTIMER: + return __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2); + default: + WARN_ON(1); + return 0; + } +} + +u64 timer_get_cval(struct arch_timer_context *ctxt) +{ + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); + + switch(arch_timer_ctx_index(ctxt)) { + case TIMER_VTIMER: + return __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + case TIMER_PTIMER: + return __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + case TIMER_HVTIMER: + return __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2); + case TIMER_HPTIMER: + return __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); + default: + WARN_ON(1); + return 0; + } +} + +static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) +{ + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); + + switch(arch_timer_ctx_index(ctxt)) { + case TIMER_VTIMER: + __vcpu_assign_sys_reg(vcpu, CNTV_CTL_EL0, ctl); + break; + case TIMER_PTIMER: + __vcpu_assign_sys_reg(vcpu, CNTP_CTL_EL0, ctl); + break; + case TIMER_HVTIMER: + __vcpu_assign_sys_reg(vcpu, CNTHV_CTL_EL2, ctl); + break; + case TIMER_HPTIMER: + __vcpu_assign_sys_reg(vcpu, CNTHP_CTL_EL2, ctl); + break; + default: + WARN_ON(1); + } +} + +static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) +{ + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); + + switch(arch_timer_ctx_index(ctxt)) { + case TIMER_VTIMER: + __vcpu_assign_sys_reg(vcpu, CNTV_CVAL_EL0, cval); + break; + case TIMER_PTIMER: + __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, cval); + break; + case TIMER_HVTIMER: + 
__vcpu_assign_sys_reg(vcpu, CNTHV_CVAL_EL2, cval); + break; + case TIMER_HPTIMER: + __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, cval); + break; + default: + WARN_ON(1); + } +} + +u64 kvm_phys_timer_read(void) +{ + return timecounter->cc->read(timecounter->cc); +} + +void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) +{ + if (vcpu_has_nv(vcpu)) { + if (is_hyp_ctxt(vcpu)) { + map->direct_vtimer = vcpu_hvtimer(vcpu); + map->direct_ptimer = vcpu_hptimer(vcpu); + map->emul_vtimer = vcpu_vtimer(vcpu); + map->emul_ptimer = vcpu_ptimer(vcpu); + } else { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_vtimer = vcpu_hvtimer(vcpu); + map->emul_ptimer = vcpu_hptimer(vcpu); + } + } else if (has_vhe()) { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_vtimer = NULL; + map->emul_ptimer = NULL; + } else { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = NULL; + map->emul_vtimer = NULL; + map->emul_ptimer = vcpu_ptimer(vcpu); + } + + trace_kvm_get_timer_map(vcpu->vcpu_id, map); +} + +static inline bool userspace_irqchip(struct kvm *kvm) +{ + return unlikely(!irqchip_in_kernel(kvm)); +} + +static void soft_timer_start(struct hrtimer *hrt, u64 ns) +{ + hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), + HRTIMER_MODE_ABS_HARD); +} + +static void soft_timer_cancel(struct hrtimer *hrt) +{ + hrtimer_cancel(hrt); +} + +static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) +{ + struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; + struct arch_timer_context *ctx; + struct timer_map map; + + /* + * We may see a timer interrupt after vcpu_put() has been called which + * sets the CPU's vcpu pointer to NULL, because even though the timer + * has been disabled in timer_save_state(), the hardware interrupt + * signal may not have been retired from the interrupt controller yet. + */ + if (!vcpu) + return IRQ_HANDLED; + + get_timer_map(vcpu, &map); + + if (irq == host_vtimer_irq) + ctx = map.direct_vtimer; + else + ctx = map.direct_ptimer; + + if (kvm_timer_should_fire(ctx)) + kvm_timer_update_irq(vcpu, true, ctx); + + if (userspace_irqchip(vcpu->kvm) && + !static_branch_unlikely(&has_gic_active_state)) + disable_percpu_irq(host_vtimer_irq); + + return IRQ_HANDLED; +} + +static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx, + u64 val) +{ + u64 now = kvm_phys_timer_read() - timer_get_offset(timer_ctx); + + if (now < val) { + u64 ns; + + ns = cyclecounter_cyc2ns(timecounter->cc, + val - now, + timecounter->mask, + &timer_ctx->ns_frac); + return ns; + } + + return 0; +} + +static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) +{ + return kvm_counter_compute_delta(timer_ctx, timer_get_cval(timer_ctx)); +} + +static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) +{ + WARN_ON(timer_ctx && timer_ctx->loaded); + return timer_ctx && + ((timer_get_ctl(timer_ctx) & + (ARCH_TIMER_CTRL_IT_MASK | ARCH_TIMER_CTRL_ENABLE)) == ARCH_TIMER_CTRL_ENABLE); +} + +static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu) +{ + return (cpus_have_final_cap(ARM64_HAS_WFXT) && + vcpu_get_flag(vcpu, IN_WFIT)); +} + +static u64 wfit_delay_ns(struct kvm_vcpu *vcpu) +{ + u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + struct arch_timer_context *ctx; + + ctx = is_hyp_ctxt(vcpu) ? vcpu_hvtimer(vcpu) : vcpu_vtimer(vcpu); + + return kvm_counter_compute_delta(ctx, val); +} + +/* + * Returns the earliest expiration time in ns among guest timers. 
+ * Note that it will return 0 if none of timers can fire. + */ +static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) +{ + u64 min_delta = ULLONG_MAX; + int i; + + for (i = 0; i < nr_timers(vcpu); i++) { + struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; + + WARN(ctx->loaded, "timer %d loaded\n", i); + if (kvm_timer_irq_can_fire(ctx)) + min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); + } + + if (vcpu_has_wfit_active(vcpu)) + min_delta = min(min_delta, wfit_delay_ns(vcpu)); + + /* If none of timers can fire, then return 0 */ + if (min_delta == ULLONG_MAX) + return 0; + + return min_delta; +} + +static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) +{ + struct arch_timer_cpu *timer; + struct kvm_vcpu *vcpu; + u64 ns; + + timer = container_of(hrt, struct arch_timer_cpu, bg_timer); + vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); + + /* + * Check that the timer has really expired from the guest's + * PoV (NTP on the host may have forced it to expire + * early). If we should have slept longer, restart it. + */ + ns = kvm_timer_earliest_exp(vcpu); + if (unlikely(ns)) { + hrtimer_forward_now(hrt, ns_to_ktime(ns)); + return HRTIMER_RESTART; + } + + kvm_vcpu_wake_up(vcpu); + return HRTIMER_NORESTART; +} + +static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) +{ + struct arch_timer_context *ctx; + struct kvm_vcpu *vcpu; + u64 ns; + + ctx = container_of(hrt, struct arch_timer_context, hrtimer); + vcpu = timer_context_to_vcpu(ctx); + + trace_kvm_timer_hrtimer_expire(ctx); + + /* + * Check that the timer has really expired from the guest's + * PoV (NTP on the host may have forced it to expire + * early). If not ready, schedule for a later time. + */ + ns = kvm_timer_compute_delta(ctx); + if (unlikely(ns)) { + hrtimer_forward_now(hrt, ns_to_ktime(ns)); + return HRTIMER_RESTART; + } + + kvm_timer_update_irq(vcpu, true, ctx); + return HRTIMER_NORESTART; +} + +static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) +{ + enum kvm_arch_timers index; + u64 cval, now; + + if (!timer_ctx) + return false; + + index = arch_timer_ctx_index(timer_ctx); + + if (timer_ctx->loaded) { + u32 cnt_ctl = 0; + + switch (index) { + case TIMER_VTIMER: + case TIMER_HVTIMER: + cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); + break; + case TIMER_PTIMER: + case TIMER_HPTIMER: + cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); + break; + case NR_KVM_TIMERS: + /* GCC is braindead */ + cnt_ctl = 0; + break; + } + + return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && + (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && + !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); + } + + if (!kvm_timer_irq_can_fire(timer_ctx)) + return false; + + cval = timer_get_cval(timer_ctx); + now = kvm_phys_timer_read() - timer_get_offset(timer_ctx); + + return cval <= now; +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return vcpu_has_wfit_active(vcpu) && wfit_delay_ns(vcpu) == 0; +} + +/* + * Reflect the timer output level into the kvm_run structure + */ +void kvm_timer_update_run(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the device bitmap with the timer states */ + regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | + KVM_ARM_DEV_EL1_PTIMER); + if (kvm_timer_should_fire(vtimer)) + regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; + if (kvm_timer_should_fire(ptimer)) + regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; +} + 
+static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) +{ + /* + * Paper over NV2 brokenness by publishing the interrupt status + * bit. This still results in a poor quality of emulation (guest + * writes will have no effect until the next exit). + * + * But hey, it's fast, right? + */ + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx); + if (is_hyp_ctxt(vcpu) && + (ctx == vcpu_vtimer(vcpu) || ctx == vcpu_ptimer(vcpu))) { + unsigned long val = timer_get_ctl(ctx); + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level); + timer_set_ctl(ctx, val); + } +} + +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, + struct arch_timer_context *timer_ctx) +{ + kvm_timer_update_status(timer_ctx, new_level); + + timer_ctx->irq.level = new_level; + trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), + timer_ctx->irq.level); + + if (userspace_irqchip(vcpu->kvm)) + return; + + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + timer_irq(timer_ctx), + timer_ctx->irq.level, + timer_ctx); +} + +/* Only called for a fully emulated timer */ +static void timer_emulate(struct arch_timer_context *ctx) +{ + bool should_fire = kvm_timer_should_fire(ctx); + + trace_kvm_timer_emulate(ctx, should_fire); + + if (should_fire != ctx->irq.level) + kvm_timer_update_irq(timer_context_to_vcpu(ctx), should_fire, ctx); + + kvm_timer_update_status(ctx, should_fire); + + /* + * If the timer can fire now, we don't need to have a soft timer + * scheduled for the future. If the timer cannot fire at all, + * then we also don't need a soft timer. + */ + if (should_fire || !kvm_timer_irq_can_fire(ctx)) + return; + + soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); +} + +static void set_cntvoff(u64 cntvoff) +{ + kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff); +} + +static void set_cntpoff(u64 cntpoff) +{ + if (has_cntpoff()) + write_sysreg_s(cntpoff, SYS_CNTPOFF_EL2); +} + +static void timer_save_state(struct arch_timer_context *ctx) +{ + struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx)); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); + unsigned long flags; + + if (!timer->enabled) + return; + + local_irq_save(flags); + + if (!ctx->loaded) + goto out; + + switch (index) { + u64 cval; + + case TIMER_VTIMER: + case TIMER_HVTIMER: + timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL)); + cval = read_sysreg_el0(SYS_CNTV_CVAL); + + if (has_broken_cntvoff()) + cval -= timer_get_offset(ctx); + + timer_set_cval(ctx, cval); + + /* Disable the timer */ + write_sysreg_el0(0, SYS_CNTV_CTL); + isb(); + + /* + * The kernel may decide to run userspace after + * calling vcpu_put, so we reset cntvoff to 0 to + * ensure a consistent read between user accesses to + * the virtual counter and kernel access to the + * physical counter of non-VHE case. + * + * For VHE, the virtual counter uses a fixed virtual + * offset of zero, so no need to zero CNTVOFF_EL2 + * register, but this is actually useful when switching + * between EL1/vEL2 with NV. + * + * Do it unconditionally, as this is either unavoidable + * or dirt cheap. 
+ */ + set_cntvoff(0); + break; + case TIMER_PTIMER: + case TIMER_HPTIMER: + timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL)); + cval = read_sysreg_el0(SYS_CNTP_CVAL); + + cval -= timer_get_offset(ctx); + + timer_set_cval(ctx, cval); + + /* Disable the timer */ + write_sysreg_el0(0, SYS_CNTP_CTL); + isb(); + + set_cntpoff(0); + break; + case NR_KVM_TIMERS: + BUG(); + } + + trace_kvm_timer_save_state(ctx); + + ctx->loaded = false; +out: + local_irq_restore(flags); +} + +/* + * Schedule the background timer before calling kvm_vcpu_halt, so that this + * thread is removed from its waitqueue and made runnable when there's a timer + * interrupt to handle. + */ +static void kvm_timer_blocking(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * If no timers are capable of raising interrupts (disabled or + * masked), then there's no more work for us to do. + */ + if (!kvm_timer_irq_can_fire(map.direct_vtimer) && + !kvm_timer_irq_can_fire(map.direct_ptimer) && + !kvm_timer_irq_can_fire(map.emul_vtimer) && + !kvm_timer_irq_can_fire(map.emul_ptimer) && + !vcpu_has_wfit_active(vcpu)) + return; + + /* + * At least one guest time will expire. Schedule a background timer. + * Set the earliest expiration time among the guest timers. + */ + soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); +} + +static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + soft_timer_cancel(&timer->bg_timer); +} + +static void timer_restore_state(struct arch_timer_context *ctx) +{ + struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx)); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); + unsigned long flags; + + if (!timer->enabled) + return; + + local_irq_save(flags); + + if (ctx->loaded) + goto out; + + switch (index) { + u64 cval, offset; + + case TIMER_VTIMER: + case TIMER_HVTIMER: + cval = timer_get_cval(ctx); + offset = timer_get_offset(ctx); + if (has_broken_cntvoff()) { + set_cntvoff(0); + cval += offset; + } else { + set_cntvoff(offset); + } + write_sysreg_el0(cval, SYS_CNTV_CVAL); + isb(); + write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL); + break; + case TIMER_PTIMER: + case TIMER_HPTIMER: + cval = timer_get_cval(ctx); + offset = timer_get_offset(ctx); + set_cntpoff(offset); + cval += offset; + write_sysreg_el0(cval, SYS_CNTP_CVAL); + isb(); + write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL); + break; + case NR_KVM_TIMERS: + BUG(); + } + + trace_kvm_timer_restore_state(ctx); + + ctx->loaded = true; +out: + local_irq_restore(flags); +} + +static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) +{ + int r; + r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active); + WARN_ON(r); +} + +static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) +{ + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx); + bool phys_active = false; + + /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. 
+ */ + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(ctx), ctx); + + if (irqchip_in_kernel(vcpu->kvm)) + phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx)); + + phys_active |= ctx->irq.level; + + set_timer_irq_phys_active(ctx, phys_active); +} + +static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. + */ + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer); + + /* + * When using a userspace irqchip with the architected timers and a + * host interrupt controller that doesn't support an active state, we + * must still prevent continuously exiting from the guest, and + * therefore mask the physical interrupt by disabling it on the host + * interrupt controller when the virtual level is high, such that the + * guest can make forward progress. Once we detect the output level + * being de-asserted, we unmask the interrupt again so that we exit + * from the guest when the timer fires. + */ + if (vtimer->irq.level) + disable_percpu_irq(host_vtimer_irq); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); +} + +/* If _pred is true, set bit in _set, otherwise set it in _clr */ +#define assign_clear_set_bit(_pred, _bit, _clr, _set) \ + do { \ + if (_pred) \ + (_set) |= (_bit); \ + else \ + (_clr) |= (_bit); \ + } while (0) + +static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, + struct timer_map *map) +{ + int hw, ret; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + /* + * We only ever unmap the vtimer irq on a VHE system that runs nested + * virtualization, in which case we have both a valid emul_vtimer, + * emul_ptimer, direct_vtimer, and direct_ptimer. + * + * Since this is called from kvm_timer_vcpu_load(), a change between + * vEL2 and vEL1/0 will have just happened, and the timer_map will + * represent this, and therefore we switch the emul/direct mappings + * below. + */ + hw = kvm_vgic_get_map(vcpu, timer_irq(map->direct_vtimer)); + if (hw < 0) { + kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_vtimer)); + kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_ptimer)); + + ret = kvm_vgic_map_phys_irq(vcpu, + map->direct_vtimer->host_timer_irq, + timer_irq(map->direct_vtimer), + &arch_timer_irq_ops); + WARN_ON_ONCE(ret); + ret = kvm_vgic_map_phys_irq(vcpu, + map->direct_ptimer->host_timer_irq, + timer_irq(map->direct_ptimer), + &arch_timer_irq_ops); + WARN_ON_ONCE(ret); + } +} + +static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) +{ + bool tvt, tpt, tvc, tpc, tvt02, tpt02; + u64 clr, set; + + /* + * No trapping gets configured here with nVHE. See + * __timer_enable_traps(), which is where the stuff happens. + */ + if (!has_vhe()) + return; + + /* + * Our default policy is not to trap anything. As we progress + * within this function, reality kicks in and we start adding + * traps based on emulation requirements. + */ + tvt = tpt = tvc = tpc = false; + tvt02 = tpt02 = false; + + /* + * NV2 badly breaks the timer semantics by redirecting accesses to + * the EL1 timer state to memory, so let's call ECV to the rescue if + * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses. 
+ * + * The treatment slightly varies depending whether we run a nVHE or + * VHE guest: nVHE will use the _EL0 registers directly, while VHE + * will use the _EL02 accessors. This translates in different trap + * bits. + * + * None of the trapping is required when running in non-HYP context, + * unless required by the L1 hypervisor settings once we advertise + * ECV+NV in the guest, or that we need trapping for other reasons. + */ + if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) { + if (vcpu_el2_e2h_is_set(vcpu)) + tvt02 = tpt02 = true; + else + tvt = tpt = true; + } + + /* + * We have two possibility to deal with a physical offset: + * + * - Either we have CNTPOFF (yay!) or the offset is 0: + * we let the guest freely access the HW + * + * - or neither of these condition apply: + * we trap accesses to the HW, but still use it + * after correcting the physical offset + */ + if (!has_cntpoff() && timer_get_offset(map->direct_ptimer)) + tpt = tpc = true; + + /* + * For the poor sods that could not correctly subtract one value + * from another, trap the full virtual timer and counter. + */ + if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer)) + tvt = tvc = true; + + /* + * Apply the enable bits that the guest hypervisor has requested for + * its own guest. We can only add traps that wouldn't have been set + * above. + * Implementation choices: we do not support NV when E2H=0 in the + * guest, and we don't support configuration where E2H is writable + * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but + * not both). This simplifies the handling of the EL1NV* bits. + */ + if (is_nested_ctxt(vcpu)) { + u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + /* Use the VHE format for mental sanity */ + if (!vcpu_el2_e2h_is_set(vcpu)) + val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10; + + tpt |= !(val & (CNTHCTL_EL1PCEN << 10)); + tpc |= !(val & (CNTHCTL_EL1PCTEN << 10)); + + tpt02 |= (val & CNTHCTL_EL1NVPCT); + tvt02 |= (val & CNTHCTL_EL1NVVCT); + } + + /* + * Now that we have collected our requirements, compute the + * trap and enable bits. + */ + set = 0; + clr = 0; + + assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr); + assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr); + assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set); + assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set); + assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set); + assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set); + + /* This only happens on VHE, so use the CNTHCTL_EL2 accessor. 
*/ + sysreg_clear_set(cnthctl_el2, clr, set); +} + +void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + if (unlikely(!timer->enabled)) + return; + + get_timer_map(vcpu, &map); + + if (static_branch_likely(&has_gic_active_state)) { + if (vcpu_has_nv(vcpu)) + kvm_timer_vcpu_load_nested_switch(vcpu, &map); + + kvm_timer_vcpu_load_gic(map.direct_vtimer); + if (map.direct_ptimer) + kvm_timer_vcpu_load_gic(map.direct_ptimer); + } else { + kvm_timer_vcpu_load_nogic(vcpu); + } + + kvm_timer_unblocking(vcpu); + + timer_restore_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_restore_state(map.direct_ptimer); + if (map.emul_vtimer) + timer_emulate(map.emul_vtimer); + if (map.emul_ptimer) + timer_emulate(map.emul_ptimer); + + timer_set_traps(vcpu, &map); +} + +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool vlevel, plevel; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; + plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; + + return kvm_timer_should_fire(vtimer) != vlevel || + kvm_timer_should_fire(ptimer) != plevel; +} + +void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + if (unlikely(!timer->enabled)) + return; + + get_timer_map(vcpu, &map); + + timer_save_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_save_state(map.direct_ptimer); + + /* + * Cancel soft timer emulation, because the only case where we + * need it after a vcpu_put is in the context of a sleeping VCPU, and + * in that case we already factor in the deadline for the physical + * timer when scheduling the bg_timer. + * + * In any case, we re-schedule the hrtimer for the physical timer when + * coming back to the VCPU thread in kvm_timer_vcpu_load(). + */ + if (map.emul_vtimer) + soft_timer_cancel(&map.emul_vtimer->hrtimer); + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); + + if (kvm_vcpu_is_blocking(vcpu)) + kvm_timer_blocking(vcpu); +} + +void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) +{ + /* + * When NV2 is on, guest hypervisors have their EL1 timer register + * accesses redirected to the VNCR page. Any guest action taken on + * the timer is postponed until the next exit, leading to a very + * poor quality of emulation. + * + * This is an unmitigated disaster, only papered over by FEAT_ECV, + * which allows trapping of the timer registers even with NV2. + * Still, this is still worse than FEAT_NV on its own. Meh. + */ + if (!cpus_have_final_cap(ARM64_HAS_ECV)) { + /* + * For a VHE guest hypervisor, the EL2 state is directly + * stored in the host EL1 timers, while the emulated EL1 + * state is stored in the VNCR page. The latter could have + * been updated behind our back, and we must reset the + * emulation of the timers. + * + * A non-VHE guest hypervisor doesn't have any direct access + * to its timers: the EL2 registers trap despite being + * notionally direct (we use the EL1 HW, as for VHE), while + * the EL1 registers access memory. + * + * In both cases, process the emulated timers on each guest + * exit. Boo. 
+ */ + struct timer_map map; + get_timer_map(vcpu, &map); + + soft_timer_cancel(&map.emul_vtimer->hrtimer); + soft_timer_cancel(&map.emul_ptimer->hrtimer); + timer_emulate(map.emul_vtimer); + timer_emulate(map.emul_ptimer); + } +} + +/* + * With a userspace irqchip we have to check if the guest de-asserted the + * timer and if so, unmask the timer irq signal on the host interrupt + * controller to ensure that we see future timer signals. + */ +static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + if (!kvm_timer_should_fire(vtimer)) { + kvm_timer_update_irq(vcpu, false, vtimer); + if (static_branch_likely(&has_gic_active_state)) + set_timer_irq_phys_active(vtimer, false); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + } +} + +void kvm_timer_sync_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + if (unlikely(!timer->enabled)) + return; + + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + unmask_vtimer_irq_user(vcpu); +} + +void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 + * and to 0 for ARMv7. We provide an implementation that always + * resets the timer to be disabled and unmasked and is compliant with + * the ARMv7 architecture. + */ + for (int i = 0; i < nr_timers(vcpu); i++) + timer_set_ctl(vcpu_get_timer(vcpu, i), 0); + + /* + * A vcpu running at EL2 is in charge of the offset applied to + * the virtual timer, so use the physical VM offset, and point + * the vcpu offset to CNTVOFF_EL2. + */ + if (vcpu_has_nv(vcpu)) { + struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset; + + offs->vcpu_offset = __ctxt_sys_reg(&vcpu->arch.ctxt, CNTVOFF_EL2); + offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset; + } + + if (timer->enabled) { + for (int i = 0; i < nr_timers(vcpu); i++) + kvm_timer_update_irq(vcpu, false, + vcpu_get_timer(vcpu, i)); + + if (irqchip_in_kernel(vcpu->kvm)) { + kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_vtimer)); + if (map.direct_ptimer) + kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_ptimer)); + } + } + + if (map.emul_vtimer) + soft_timer_cancel(&map.emul_vtimer->hrtimer); + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); +} + +static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) +{ + struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid); + struct kvm *kvm = vcpu->kvm; + + ctxt->timer_id = timerid; + + if (timerid == TIMER_VTIMER) + ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; + else + ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; + + hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + + switch (timerid) { + case TIMER_PTIMER: + case TIMER_HPTIMER: + ctxt->host_timer_irq = host_ptimer_irq; + break; + case TIMER_VTIMER: + case TIMER_HVTIMER: + ctxt->host_timer_irq = host_vtimer_irq; + break; + } +} + +void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + for (int i = 0; i < NR_KVM_TIMERS; i++) + timer_context_init(vcpu, i); + + /* Synchronize offsets across timers of a VM if not already provided */ + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { + timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read()); + timer_set_offset(vcpu_ptimer(vcpu), 0); + } + + 
hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); +} + +void kvm_timer_init_vm(struct kvm *kvm) +{ + for (int i = 0; i < NR_KVM_TIMERS; i++) + kvm->arch.timer_data.ppi[i] = default_ppi[i]; +} + +void kvm_timer_cpu_up(void) +{ + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + if (host_ptimer_irq) + enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); +} + +void kvm_timer_cpu_down(void) +{ + disable_percpu_irq(host_vtimer_irq); + if (host_ptimer_irq) + disable_percpu_irq(host_ptimer_irq); +} + +static u64 read_timer_ctl(struct arch_timer_context *timer) +{ + /* + * Set ISTATUS bit if it's expired. + * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is + * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit + * regardless of ENABLE bit for our implementation convenience. + */ + u32 ctl = timer_get_ctl(timer); + + if (!kvm_timer_compute_delta(timer)) + ctl |= ARCH_TIMER_CTRL_IT_STAT; + + return ctl; +} + +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + switch (treg) { + case TIMER_REG_TVAL: + val = timer_get_cval(timer) - kvm_phys_timer_read() + timer_get_offset(timer); + val = lower_32_bits(val); + break; + + case TIMER_REG_CTL: + val = read_timer_ctl(timer); + break; + + case TIMER_REG_CVAL: + val = timer_get_cval(timer); + break; + + case TIMER_REG_CNT: + val = kvm_phys_timer_read() - timer_get_offset(timer); + break; + + case TIMER_REG_VOFF: + val = *timer->offset.vcpu_offset; + break; + + default: + BUG(); + } + + return val; +} + +u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg) +{ + struct arch_timer_context *timer; + struct timer_map map; + u64 val; + + get_timer_map(vcpu, &map); + timer = vcpu_get_timer(vcpu, tmr); + + if (timer == map.emul_vtimer || timer == map.emul_ptimer) + return kvm_arm_timer_read(vcpu, timer, treg); + + preempt_disable(); + timer_save_state(timer); + + val = kvm_arm_timer_read(vcpu, timer, treg); + + timer_restore_state(timer); + preempt_enable(); + + return val; +} + +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val) +{ + switch (treg) { + case TIMER_REG_TVAL: + timer_set_cval(timer, kvm_phys_timer_read() - timer_get_offset(timer) + (s32)val); + break; + + case TIMER_REG_CTL: + timer_set_ctl(timer, val & ~ARCH_TIMER_CTRL_IT_STAT); + break; + + case TIMER_REG_CVAL: + timer_set_cval(timer, val); + break; + + case TIMER_REG_VOFF: + *timer->offset.vcpu_offset = val; + break; + + default: + BUG(); + } +} + +void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg, + u64 val) +{ + struct arch_timer_context *timer; + struct timer_map map; + + get_timer_map(vcpu, &map); + timer = vcpu_get_timer(vcpu, tmr); + if (timer == map.emul_vtimer || timer == map.emul_ptimer) { + soft_timer_cancel(&timer->hrtimer); + kvm_arm_timer_write(vcpu, timer, treg, val); + timer_emulate(timer); + } else { + preempt_disable(); + timer_save_state(timer); + kvm_arm_timer_write(vcpu, timer, treg, val); + timer_restore_state(timer); + preempt_enable(); + } +} + +static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) +{ + if (vcpu) + irqd_set_forwarded_to_vcpu(d); + else + irqd_clr_forwarded_to_vcpu(d); + + return 0; +} + +static int timer_irq_set_irqchip_state(struct irq_data *d, + enum 
irqchip_irq_state which, bool val) +{ + if (which != IRQCHIP_STATE_ACTIVE || !irqd_is_forwarded_to_vcpu(d)) + return irq_chip_set_parent_state(d, which, val); + + if (val) + irq_chip_mask_parent(d); + else + irq_chip_unmask_parent(d); + + return 0; +} + +static void timer_irq_eoi(struct irq_data *d) +{ + if (!irqd_is_forwarded_to_vcpu(d)) + irq_chip_eoi_parent(d); +} + +static void timer_irq_ack(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_ack) + d->chip->irq_ack(d); +} + +static struct irq_chip timer_chip = { + .name = "KVM", + .irq_ack = timer_irq_ack, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = timer_irq_eoi, + .irq_set_type = irq_chip_set_type_parent, + .irq_set_vcpu_affinity = timer_irq_set_vcpu_affinity, + .irq_set_irqchip_state = timer_irq_set_irqchip_state, +}; + +static int timer_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + irq_hw_number_t hwirq = (uintptr_t)arg; + + return irq_domain_set_hwirq_and_chip(domain, virq, hwirq, + &timer_chip, NULL); +} + +static void timer_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ +} + +static const struct irq_domain_ops timer_domain_ops = { + .alloc = timer_irq_domain_alloc, + .free = timer_irq_domain_free, +}; + +static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags) +{ + *flags = irq_get_trigger_type(virq); + if (*flags != IRQF_TRIGGER_HIGH && *flags != IRQF_TRIGGER_LOW) { + kvm_err("Invalid trigger for timer IRQ%d, assuming level low\n", + virq); + *flags = IRQF_TRIGGER_LOW; + } +} + +static int kvm_irq_init(struct arch_timer_kvm_info *info) +{ + struct irq_domain *domain = NULL; + + if (info->virtual_irq <= 0) { + kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", + info->virtual_irq); + return -ENODEV; + } + + host_vtimer_irq = info->virtual_irq; + kvm_irq_fixup_flags(host_vtimer_irq, &host_vtimer_irq_flags); + + if (kvm_vgic_global_state.no_hw_deactivation) { + struct fwnode_handle *fwnode; + struct irq_data *data; + + fwnode = irq_domain_alloc_named_fwnode("kvm-timer"); + if (!fwnode) + return -ENOMEM; + + /* Assume both vtimer and ptimer in the same parent */ + data = irq_get_irq_data(host_vtimer_irq); + domain = irq_domain_create_hierarchy(data->domain, 0, + NR_KVM_TIMERS, fwnode, + &timer_domain_ops, NULL); + if (!domain) { + irq_domain_free_fwnode(fwnode); + return -ENOMEM; + } + + arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE; + WARN_ON(irq_domain_push_irq(domain, host_vtimer_irq, + (void *)TIMER_VTIMER)); + } + + if (info->physical_irq > 0) { + host_ptimer_irq = info->physical_irq; + kvm_irq_fixup_flags(host_ptimer_irq, &host_ptimer_irq_flags); + + if (domain) + WARN_ON(irq_domain_push_irq(domain, host_ptimer_irq, + (void *)TIMER_PTIMER)); + } + + return 0; +} + +static void kvm_timer_handle_errata(void) +{ + u64 mmfr0, mmfr1, mmfr4; + + /* + * CNTVOFF_EL2 is broken on some implementations. For those, we trap + * all virtual timer/counter accesses, requiring FEAT_ECV. + * + * However, a hypervisor supporting nesting is likely to mitigate the + * erratum at L0, and not require other levels to mitigate it (which + * would otherwise be a terrible performance sink due to trap + * amplification). + * + * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0, + * and that NV is likely not to (because of limitations of the + * architecture), only enable the workaround when FEAT_VHE and + * FEAT_E2H0 are both detected. 
Time will tell if this actually holds. + */ + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1); + if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) && + !SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) && + SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) && + (has_vhe() || has_hvhe()) && + cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) { + static_branch_enable(&broken_cntvoff_key); + kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n"); + } +} + +int __init kvm_timer_hyp_init(bool has_gic) +{ + struct arch_timer_kvm_info *info; + int err; + + info = arch_timer_get_kvm_info(); + timecounter = &info->timecounter; + + if (!timecounter->cc) { + kvm_err("kvm_arch_timer: uninitialized timecounter\n"); + return -ENODEV; + } + + err = kvm_irq_init(info); + if (err) + return err; + + /* First, do the virtual EL1 timer irq */ + + err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, + "kvm guest vtimer", kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n", + host_vtimer_irq, err); + return err; + } + + if (has_gic) { + err = irq_set_vcpu_affinity(host_vtimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_vtimer_irq; + } + + static_branch_enable(&has_gic_active_state); + } + + kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); + + /* Now let's do the physical EL1 timer irq */ + + if (info->physical_irq > 0) { + err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler, + "kvm guest ptimer", kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", + host_ptimer_irq, err); + goto out_free_vtimer_irq; + } + + if (has_gic) { + err = irq_set_vcpu_affinity(host_ptimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_ptimer_irq; + } + } + + kvm_debug("physical timer IRQ%d\n", host_ptimer_irq); + } else if (has_vhe()) { + kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", + info->physical_irq); + err = -ENODEV; + goto out_free_vtimer_irq; + } + + kvm_timer_handle_errata(); + return 0; + +out_free_ptimer_irq: + if (info->physical_irq > 0) + free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus()); +out_free_vtimer_irq: + free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); + return err; +} + +void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + soft_timer_cancel(&timer->bg_timer); +} + +static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) +{ + u32 ppis = 0; + bool valid; + + mutex_lock(&vcpu->kvm->arch.config_lock); + + for (int i = 0; i < nr_timers(vcpu); i++) { + struct arch_timer_context *ctx; + int irq; + + ctx = vcpu_get_timer(vcpu, i); + irq = timer_irq(ctx); + if (kvm_vgic_set_owner(vcpu, irq, ctx)) + break; + + /* + * We know by construction that we only have PPIs, so + * all values are less than 32. 
+ */ + ppis |= BIT(irq); + } + + valid = hweight32(ppis) == nr_timers(vcpu); + + if (valid) + set_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags); + + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return valid; +} + +static bool kvm_arch_timer_get_input_level(int vintid) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + if (WARN(!vcpu, "No vcpu context!\n")) + return false; + + for (int i = 0; i < nr_timers(vcpu); i++) { + struct arch_timer_context *ctx; + + ctx = vcpu_get_timer(vcpu, i); + if (timer_irq(ctx) == vintid) + return kvm_timer_should_fire(ctx); + } + + /* A timer IRQ has fired, but no matching timer was found? */ + WARN_RATELIMIT(1, "timer INTID%d unknown\n", vintid); + + return false; +} + +int kvm_timer_enable(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + int ret; + + if (timer->enabled) + return 0; + + /* Without a VGIC we do not map virtual IRQs to physical IRQs */ + if (!irqchip_in_kernel(vcpu->kvm)) + goto no_vgic; + + /* + * At this stage, we have the guarantee that the vgic is both + * available and initialized. + */ + if (!timer_irqs_are_valid(vcpu)) { + kvm_debug("incorrectly configured timer irqs\n"); + return -EINVAL; + } + + get_timer_map(vcpu, &map); + + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_vtimer->host_timer_irq, + timer_irq(map.direct_vtimer), + &arch_timer_irq_ops); + if (ret) + return ret; + + if (map.direct_ptimer) { + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_ptimer->host_timer_irq, + timer_irq(map.direct_ptimer), + &arch_timer_irq_ops); + } + + if (ret) + return ret; + +no_vgic: + timer->enabled = 1; + return 0; +} + +/* If we have CNTPOFF, permanently set ECV to enable it */ +void kvm_timer_init_vhe(void) +{ + if (cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)) + sysreg_clear_set(cnthctl_el2, 0, CNTHCTL_ECV); +} + +int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + int __user *uaddr = (int __user *)(long)attr->addr; + int irq, idx, ret = 0; + + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; + + if (get_user(irq, uaddr)) + return -EFAULT; + + if (!(irq_is_ppi(irq))) + return -EINVAL; + + mutex_lock(&vcpu->kvm->arch.config_lock); + + if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, + &vcpu->kvm->arch.flags)) { + ret = -EBUSY; + goto out; + } + + switch (attr->attr) { + case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: + idx = TIMER_VTIMER; + break; + case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + idx = TIMER_PTIMER; + break; + case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: + idx = TIMER_HVTIMER; + break; + case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: + idx = TIMER_HPTIMER; + break; + default: + ret = -ENXIO; + goto out; + } + + /* + * We cannot validate the IRQ unicity before we run, so take it at + * face value. The verdict will be given on first vcpu run, for each + * vcpu. Yes this is late. Blame it on the stupid API. 
+ */ + vcpu->kvm->arch.timer_data.ppi[idx] = irq; + +out: + mutex_unlock(&vcpu->kvm->arch.config_lock); + return ret; +} + +int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + int __user *uaddr = (int __user *)(long)attr->addr; + struct arch_timer_context *timer; + int irq; + + switch (attr->attr) { + case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: + timer = vcpu_vtimer(vcpu); + break; + case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + timer = vcpu_ptimer(vcpu); + break; + case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: + timer = vcpu_hvtimer(vcpu); + break; + case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: + timer = vcpu_hptimer(vcpu); + break; + default: + return -ENXIO; + } + + irq = timer_irq(timer); + return put_user(irq, uaddr); +} + +int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: + case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: + case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: + return 0; + } + + return -ENXIO; +} + +int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, + struct kvm_arm_counter_offset *offset) +{ + int ret = 0; + + if (offset->reserved) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (!kvm_trylock_all_vcpus(kvm)) { + set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags); + + /* + * If userspace decides to set the offset using this + * API rather than merely restoring the counter + * values, the offset applies to both the virtual and + * physical views. + */ + kvm->arch.timer_data.voffset = offset->counter_offset; + kvm->arch.timer_data.poffset = offset->counter_offset; + + kvm_unlock_all_vcpus(kvm); + } else { + ret = -EBUSY; + } + + mutex_unlock(&kvm->lock); + + return ret; +} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c new file mode 100644 index 000000000000..4f80da0c0d1d --- /dev/null +++ b/arch/arm64/kvm/arm.c @@ -0,0 +1,3012 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + */ + +#include <linux/bug.h> +#include <linux/cpu_pm.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> +#include <linux/mman.h> +#include <linux/sched.h> +#include <linux/kvm.h> +#include <linux/kvm_irqfd.h> +#include <linux/irqbypass.h> +#include <linux/sched/stat.h> +#include <linux/psci.h> +#include <trace/events/kvm.h> + +#define CREATE_TRACE_POINTS +#include "trace_arm.h" + +#include <linux/uaccess.h> +#include <asm/ptrace.h> +#include <asm/mman.h> +#include <asm/tlbflush.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/virt.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> +#include <asm/kvm_pkvm.h> +#include <asm/kvm_ptrauth.h> +#include <asm/sections.h> + +#include <kvm/arm_hypercalls.h> +#include <kvm/arm_pmu.h> +#include <kvm/arm_psci.h> + +#include "sys_regs.h" + +static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; + +enum kvm_wfx_trap_policy { + KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */ + KVM_WFX_NOTRAP, + KVM_WFX_TRAP, +}; + +static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; +static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; + +DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); + 
+DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base); +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); + +DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); + +static bool vgic_present, kvm_arm_initialised; + +static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); + +bool is_kvm_arm_initialised(void) +{ + return kvm_arm_initialised; +} + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; +} + +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + int r = -EINVAL; + + if (cap->flags) + return -EINVAL; + + if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap)) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_ARM_NISV_TO_USER: + r = 0; + set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, + &kvm->arch.flags); + break; + case KVM_CAP_ARM_MTE: + mutex_lock(&kvm->lock); + if (system_supports_mte() && !kvm->created_vcpus) { + r = 0; + set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_ARM_SYSTEM_SUSPEND: + r = 0; + set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); + break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + mutex_lock(&kvm->slots_lock); + /* + * To keep things simple, allow changing the chunk + * size only when no memory slots have been created. + */ + if (kvm_are_all_memslots_empty(kvm)) { + u64 new_cap = cap->args[0]; + + if (!new_cap || kvm_is_block_size_supported(new_cap)) { + r = 0; + kvm->arch.mmu.split_page_chunk_size = new_cap; + } + } + mutex_unlock(&kvm->slots_lock); + break; + case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + r = 0; + set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags); + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_ARM_SEA_TO_USER: + r = 0; + set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags); + break; + default: + break; + } + + return r; +} + +static int kvm_arm_default_max_vcpus(void) +{ + return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; +} + +/** + * kvm_arch_init_vm - initializes a VM data structure + * @kvm: pointer to the KVM struct + * @type: kvm device type + */ +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + int ret; + + mutex_init(&kvm->arch.config_lock); + +#ifdef CONFIG_LOCKDEP + /* Clue in lockdep that the config_lock must be taken inside kvm->lock */ + mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); + mutex_unlock(&kvm->arch.config_lock); + mutex_unlock(&kvm->lock); +#endif + + kvm_init_nested(kvm); + + ret = kvm_share_hyp(kvm, kvm + 1); + if (ret) + return ret; + + if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) { + ret = -ENOMEM; + goto err_unshare_kvm; + } + cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask); + + ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type); + if (ret) + goto err_free_cpumask; + + if (is_protected_kvm_enabled()) { + /* + * If any failures occur after this is successful, make sure to + * call __pkvm_unreserve_vm to unreserve the VM in hyp. 
+ */ + ret = pkvm_init_host_vm(kvm); + if (ret) + goto err_free_cpumask; + } + + kvm_vgic_early_init(kvm); + + kvm_timer_init_vm(kvm); + + /* The maximum number of VCPUs is limited by the host's GIC model */ + kvm->max_vcpus = kvm_arm_default_max_vcpus(); + + kvm_arm_init_hypercalls(kvm); + + bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); + + return 0; + +err_free_cpumask: + free_cpumask_var(kvm->arch.supported_cpus); +err_unshare_kvm: + kvm_unshare_hyp(kvm, kvm + 1); + return ret; +} + +vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +void kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + kvm_sys_regs_create_debugfs(kvm); + kvm_s2_ptdump_create_debugfs(kvm); +} + +static void kvm_destroy_mpidr_data(struct kvm *kvm) +{ + struct kvm_mpidr_data *data; + + mutex_lock(&kvm->arch.config_lock); + + data = rcu_dereference_protected(kvm->arch.mpidr_data, + lockdep_is_held(&kvm->arch.config_lock)); + if (data) { + rcu_assign_pointer(kvm->arch.mpidr_data, NULL); + synchronize_rcu(); + kfree(data); + } + + mutex_unlock(&kvm->arch.config_lock); +} + +/** + * kvm_arch_destroy_vm - destroy the VM data structure + * @kvm: pointer to the KVM struct + */ +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + bitmap_free(kvm->arch.pmu_filter); + free_cpumask_var(kvm->arch.supported_cpus); + + kvm_vgic_destroy(kvm); + + if (is_protected_kvm_enabled()) + pkvm_destroy_hyp_vm(kvm); + + kvm_destroy_mpidr_data(kvm); + + kfree(kvm->arch.sysreg_masks); + kvm_destroy_vcpus(kvm); + + kvm_unshare_hyp(kvm, kvm + 1); + + kvm_arm_teardown_hypercalls(kvm); +} + +static bool kvm_has_full_ptr_auth(void) +{ + bool apa, gpa, api, gpi, apa3, gpa3; + u64 isar1, isar2, val; + + /* + * Check that: + * + * - both Address and Generic auth are implemented for a given + * algorithm (Q5, IMPDEF or Q3) + * - only a single algorithm is implemented. 
+ */ + if (!system_has_full_ptr_auth()) + return false; + + isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); + isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + + apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1); + val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1); + gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP); + + api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1); + val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1); + gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP); + + apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2); + val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2); + gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP); + + return (apa == gpa && api == gpi && apa3 == gpa3 && + (apa + api + apa3) == 1); +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r; + + if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext)) + return 0; + + switch (ext) { + case KVM_CAP_IRQCHIP: + r = vgic_present; + break; + case KVM_CAP_IOEVENTFD: + case KVM_CAP_USER_MEMORY: + case KVM_CAP_SYNC_MMU: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_ONE_REG: + case KVM_CAP_ARM_PSCI: + case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: + case KVM_CAP_ARM_NISV_TO_USER: + case KVM_CAP_ARM_INJECT_EXT_DABT: + case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_VCPU_ATTRIBUTES: + case KVM_CAP_PTP_KVM: + case KVM_CAP_ARM_SYSTEM_SUSPEND: + case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_COUNTER_OFFSET: + case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: + case KVM_CAP_ARM_SEA_TO_USER: + r = 1; + break; + case KVM_CAP_SET_GUEST_DEBUG2: + return KVM_GUESTDBG_VALID_MASK; + case KVM_CAP_ARM_SET_DEVICE_ADDR: + r = 1; + break; + case KVM_CAP_NR_VCPUS: + /* + * ARM64 treats KVM_CAP_NR_CPUS differently from all other + * architectures, as it does not always bound it to + * KVM_CAP_MAX_VCPUS. It should not matter much because + * this is just an advisory value. + */ + r = min_t(unsigned int, num_online_cpus(), + kvm_arm_default_max_vcpus()); + break; + case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: + if (kvm) + r = kvm->max_vcpus; + else + r = kvm_arm_default_max_vcpus(); + break; + case KVM_CAP_MSI_DEVID: + if (!kvm) + r = -EINVAL; + else + r = kvm->arch.vgic.msis_require_devid; + break; + case KVM_CAP_ARM_USER_IRQ: + /* + * 1: EL1_VTIMER, EL1_PTIMER, and PMU. 
+ * (bump this number if adding more devices) + */ + r = 1; + break; + case KVM_CAP_ARM_MTE: + r = system_supports_mte(); + break; + case KVM_CAP_STEAL_TIME: + r = kvm_arm_pvtime_supported(); + break; + case KVM_CAP_ARM_EL1_32BIT: + r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1); + break; + case KVM_CAP_ARM_EL2: + r = cpus_have_final_cap(ARM64_HAS_NESTED_VIRT); + break; + case KVM_CAP_ARM_EL2_E2H0: + r = cpus_have_final_cap(ARM64_HAS_HCR_NV1); + break; + case KVM_CAP_GUEST_DEBUG_HW_BPS: + r = get_num_brps(); + break; + case KVM_CAP_GUEST_DEBUG_HW_WPS: + r = get_num_wrps(); + break; + case KVM_CAP_ARM_PMU_V3: + r = kvm_supports_guest_pmuv3(); + break; + case KVM_CAP_ARM_INJECT_SERROR_ESR: + r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); + break; + case KVM_CAP_ARM_VM_IPA_SIZE: + r = get_kvm_ipa_limit(); + break; + case KVM_CAP_ARM_SVE: + r = system_supports_sve(); + break; + case KVM_CAP_ARM_PTRAUTH_ADDRESS: + case KVM_CAP_ARM_PTRAUTH_GENERIC: + r = kvm_has_full_ptr_auth(); + break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + if (kvm) + r = kvm->arch.mmu.split_page_chunk_size; + else + r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + break; + case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES: + r = kvm_supported_block_sizes(); + break; + case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES: + r = BIT(0); + break; + case KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED: + if (!kvm) + r = -EINVAL; + else + r = kvm_supports_cacheable_pfnmap(); + break; + + default: + r = 0; + } + + return r; +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -EINVAL; +} + +struct kvm *kvm_arch_alloc_vm(void) +{ + size_t sz = sizeof(struct kvm); + + if (!has_vhe()) + return kzalloc(sz, GFP_KERNEL_ACCOUNT); + + return kvzalloc(sz, GFP_KERNEL_ACCOUNT); +} + +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) + return -EBUSY; + + if (id >= kvm->max_vcpus) + return -EINVAL; + + return 0; +} + +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + int err; + + spin_lock_init(&vcpu->arch.mp_state_lock); + +#ifdef CONFIG_LOCKDEP + /* Inform lockdep that the config_lock is acquired after vcpu->mutex */ + mutex_lock(&vcpu->mutex); + mutex_lock(&vcpu->kvm->arch.config_lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); + mutex_unlock(&vcpu->mutex); +#endif + + /* Force users to call KVM_ARM_VCPU_INIT */ + vcpu_clear_flag(vcpu, VCPU_INITIALIZED); + + vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; + + /* Set up the timer */ + kvm_timer_vcpu_init(vcpu); + + kvm_pmu_vcpu_init(vcpu); + + kvm_arm_pvtime_vcpu_init(&vcpu->arch); + + vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + + /* + * This vCPU may have been created after mpidr_data was initialized. + * Throw out the pre-computed mappings if that is the case which forces + * KVM to fall back to iteratively searching the vCPUs. 
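+	 * The table is rebuilt lazily the next time a vCPU runs for the
+	 * first time, in kvm_init_mpidr_data().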
+ */ + kvm_destroy_mpidr_data(vcpu->kvm); + + err = kvm_vgic_vcpu_init(vcpu); + if (err) + return err; + + err = kvm_share_hyp(vcpu, vcpu + 1); + if (err) + kvm_vgic_vcpu_destroy(vcpu); + + return err; +} + +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + if (!is_protected_kvm_enabled()) + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + else + free_hyp_memcache(&vcpu->arch.pkvm_memcache); + kvm_timer_vcpu_terminate(vcpu); + kvm_pmu_vcpu_destroy(vcpu); + kvm_vgic_vcpu_destroy(vcpu); + kvm_arm_vcpu_destroy(vcpu); +} + +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + +} + +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + +} + +static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) { + /* + * Either we're running an L2 guest, and the API/APK bits come + * from L1's HCR_EL2, or API/APK are both set. + */ + if (unlikely(is_nested_ctxt(vcpu))) { + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + val &= (HCR_API | HCR_APK); + vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK); + vcpu->arch.hcr_el2 |= val; + } else { + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + } + + /* + * Save the host keys if there is any chance for the guest + * to use pauth, as the entry code will reload the guest + * keys in that case. + */ + if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) { + struct kvm_cpu_context *ctxt; + + ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt); + ptrauth_save_keys(ctxt); + } + } +} + +static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) + return kvm_wfi_trap_policy == KVM_WFX_NOTRAP; + + return single_task_running() && + (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) || + vcpu->kvm->arch.vgic.nassgireq); +} + +static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) + return kvm_wfe_trap_policy == KVM_WFX_NOTRAP; + + return single_task_running(); +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvm_s2_mmu *mmu; + int *last_ran; + + if (is_protected_kvm_enabled()) + goto nommu; + + if (vcpu_has_nv(vcpu)) + kvm_vcpu_load_hw_mmu(vcpu); + + mmu = vcpu->arch.hw_mmu; + last_ran = this_cpu_ptr(mmu->last_vcpu_ran); + + /* + * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2, + * which happens eagerly in VHE. + * + * Also, the VMID allocator only preserves VMIDs that are active at the + * time of rollover, so KVM might need to grab a new VMID for the MMU if + * this is called from kvm_sched_in(). + */ + kvm_arm_vmid_update(&mmu->vmid); + + /* + * We guarantee that both TLBs and I-cache are private to each + * vcpu. If detecting that a vcpu from the same VM has + * previously run on the same physical CPU, call into the + * hypervisor code to nuke the relevant contexts. + * + * We might get preempted before the vCPU actually runs, but + * over-invalidation doesn't affect correctness. + */ + if (*last_ran != vcpu->vcpu_idx) { + kvm_call_hyp(__kvm_flush_cpu_context, mmu); + *last_ran = vcpu->vcpu_idx; + } + +nommu: + vcpu->cpu = cpu; + + /* + * The timer must be loaded before the vgic to correctly set up physical + * interrupt deactivation in nested state (e.g. timer interrupt). 
+ */ + kvm_timer_vcpu_load(vcpu); + kvm_vgic_load(vcpu); + kvm_vcpu_load_debug(vcpu); + kvm_vcpu_load_fgt(vcpu); + if (has_vhe()) + kvm_vcpu_load_vhe(vcpu); + kvm_arch_vcpu_load_fp(vcpu); + kvm_vcpu_pmu_restore_guest(vcpu); + if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) + kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); + + if (kvm_vcpu_should_clear_twe(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_TWE; + else + vcpu->arch.hcr_el2 |= HCR_TWE; + + if (kvm_vcpu_should_clear_twi(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_TWI; + else + vcpu->arch.hcr_el2 |= HCR_TWI; + + vcpu_set_pauth_traps(vcpu); + + if (is_protected_kvm_enabled()) { + kvm_call_hyp_nvhe(__pkvm_vcpu_load, + vcpu->kvm->arch.pkvm.handle, + vcpu->vcpu_idx, vcpu->arch.hcr_el2); + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + } + + if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) + vcpu_set_on_unsupported_cpu(vcpu); +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + if (is_protected_kvm_enabled()) { + kvm_call_hyp(__vgic_v3_save_aprs, &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp_nvhe(__pkvm_vcpu_put); + } + + kvm_vcpu_put_debug(vcpu); + kvm_arch_vcpu_put_fp(vcpu); + if (has_vhe()) + kvm_vcpu_put_vhe(vcpu); + kvm_timer_vcpu_put(vcpu); + kvm_vgic_put(vcpu); + kvm_vcpu_pmu_restore_host(vcpu); + if (vcpu_has_nv(vcpu)) + kvm_vcpu_put_hw_mmu(vcpu); + kvm_arm_vmid_clear_active(); + + vcpu_clear_on_unsupported_cpu(vcpu); + vcpu->cpu = -1; +} + +static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) +{ + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); + kvm_make_request(KVM_REQ_SLEEP, vcpu); + kvm_vcpu_kick(vcpu); +} + +void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->arch.mp_state_lock); + __kvm_arm_vcpu_power_off(vcpu); + spin_unlock(&vcpu->arch.mp_state_lock); +} + +bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu) +{ + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED; +} + +static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED); + kvm_make_request(KVM_REQ_SUSPEND, vcpu); + kvm_vcpu_kick(vcpu); +} + +static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu) +{ + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED; +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + *mp_state = READ_ONCE(vcpu->arch.mp_state); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + int ret = 0; + + spin_lock(&vcpu->arch.mp_state_lock); + + switch (mp_state->mp_state) { + case KVM_MP_STATE_RUNNABLE: + WRITE_ONCE(vcpu->arch.mp_state, *mp_state); + break; + case KVM_MP_STATE_STOPPED: + __kvm_arm_vcpu_power_off(vcpu); + break; + case KVM_MP_STATE_SUSPENDED: + kvm_arm_vcpu_suspend(vcpu); + break; + default: + ret = -EINVAL; + } + + spin_unlock(&vcpu->arch.mp_state_lock); + + return ret; +} + +/** + * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled + * @v: The VCPU pointer + * + * If the guest CPU is not waiting for interrupts or an interrupt line is + * asserted, the CPU is by definition runnable. 
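+ *
+ * Return: 1 if the vCPU has a pending interrupt and is neither stopped
+ * nor paused, 0 otherwise.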
+ */ +int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) +{ + bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF | HCR_VSE); + + return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) + && !kvm_arm_vcpu_stopped(v) && !v->arch.pause); +} + +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu_mode_priv(vcpu); +} + +#ifdef CONFIG_GUEST_PERF_EVENTS +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return *vcpu_pc(vcpu); +} +#endif + +static void kvm_init_mpidr_data(struct kvm *kvm) +{ + struct kvm_mpidr_data *data = NULL; + unsigned long c, mask, nr_entries; + u64 aff_set = 0, aff_clr = ~0UL; + struct kvm_vcpu *vcpu; + + mutex_lock(&kvm->arch.config_lock); + + if (rcu_access_pointer(kvm->arch.mpidr_data) || + atomic_read(&kvm->online_vcpus) == 1) + goto out; + + kvm_for_each_vcpu(c, vcpu, kvm) { + u64 aff = kvm_vcpu_get_mpidr_aff(vcpu); + aff_set |= aff; + aff_clr &= aff; + } + + /* + * A significant bit can be either 0 or 1, and will only appear in + * aff_set. Use aff_clr to weed out the useless stuff. + */ + mask = aff_set ^ aff_clr; + nr_entries = BIT_ULL(hweight_long(mask)); + + /* + * Don't let userspace fool us. If we need more than a single page + * to describe the compressed MPIDR array, just fall back to the + * iterative method. Single vcpu VMs do not need this either. + */ + if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE) + data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries), + GFP_KERNEL_ACCOUNT); + + if (!data) + goto out; + + data->mpidr_mask = mask; + + kvm_for_each_vcpu(c, vcpu, kvm) { + u64 aff = kvm_vcpu_get_mpidr_aff(vcpu); + u16 index = kvm_mpidr_index(data, aff); + + data->cmpidr_to_idx[index] = c; + } + + rcu_assign_pointer(kvm->arch.mpidr_data, data); +out: + mutex_unlock(&kvm->arch.config_lock); +} + +/* + * Handle both the initialisation that is being done when the vcpu is + * run for the first time, as well as the updates that must be + * performed each time we get a new thread dealing with this vcpu. + */ +int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret; + + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + + if (!kvm_arm_vcpu_is_finalized(vcpu)) + return -EPERM; + + if (likely(vcpu_has_run_once(vcpu))) + return 0; + + kvm_init_mpidr_data(kvm); + + if (likely(irqchip_in_kernel(kvm))) { + /* + * Map the VGIC hardware resources before running a vcpu the + * first time on this VM. + */ + ret = kvm_vgic_map_resources(kvm); + if (ret) + return ret; + } + + ret = kvm_finalize_sys_regs(vcpu); + if (ret) + return ret; + + if (vcpu_has_nv(vcpu)) { + ret = kvm_vcpu_allocate_vncr_tlb(vcpu); + if (ret) + return ret; + + ret = kvm_vgic_vcpu_nv_init(vcpu); + if (ret) + return ret; + } + + /* + * This needs to happen after any restriction has been applied + * to the feature set. 
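+	 * kvm_calculate_traps() derives the trap configuration from the vCPU
+	 * features, so it must only run once they are final.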
+ */ + kvm_calculate_traps(vcpu); + + ret = kvm_timer_enable(vcpu); + if (ret) + return ret; + + if (kvm_vcpu_has_pmu(vcpu)) { + ret = kvm_arm_pmu_v3_enable(vcpu); + if (ret) + return ret; + } + + if (is_protected_kvm_enabled()) { + ret = pkvm_create_hyp_vm(kvm); + if (ret) + return ret; + + ret = pkvm_create_hyp_vcpu(vcpu); + if (ret) + return ret; + } + + mutex_lock(&kvm->arch.config_lock); + set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); + mutex_unlock(&kvm->arch.config_lock); + + return ret; +} + +bool kvm_arch_intc_initialized(struct kvm *kvm) +{ + return vgic_initialized(kvm); +} + +void kvm_arm_halt_guest(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.pause = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP); +} + +void kvm_arm_resume_guest(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.pause = false; + __kvm_vcpu_wake_up(vcpu); + } +} + +static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu) +{ + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); + + rcuwait_wait_event(wait, + (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause), + TASK_INTERRUPTIBLE); + + if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) { + /* Awaken to handle a signal, request we sleep again later. */ + kvm_make_request(KVM_REQ_SLEEP, vcpu); + } + + /* + * Make sure we will observe a potential reset request if we've + * observed a change to the power state. Pairs with the smp_wmb() in + * kvm_psci_vcpu_on(). + */ + smp_rmb(); +} + +/** + * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior + * @vcpu: The VCPU pointer + * + * Suspend execution of a vCPU until a valid wake event is detected, i.e. until + * the vCPU is runnable. The vCPU may or may not be scheduled out, depending + * on when a wake event arrives, e.g. there may already be a pending wake event. + */ +void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) +{ + /* + * Sync back the state of the GIC CPU interface so that we have + * the latest PMR and group enables. This ensures that + * kvm_arch_vcpu_runnable has up-to-date data to decide whether + * we have pending interrupts, e.g. when determining if the + * vCPU should block. + * + * For the same reason, we want to tell GICv4 that we need + * doorbells to be signalled, should an interrupt become pending. + */ + preempt_disable(); + vcpu_set_flag(vcpu, IN_WFI); + kvm_vgic_put(vcpu); + preempt_enable(); + + kvm_vcpu_halt(vcpu); + vcpu_clear_flag(vcpu, IN_WFIT); + + preempt_disable(); + vcpu_clear_flag(vcpu, IN_WFI); + kvm_vgic_load(vcpu); + preempt_enable(); +} + +static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + if (!kvm_arm_vcpu_suspended(vcpu)) + return 1; + + kvm_vcpu_wfi(vcpu); + + /* + * The suspend state is sticky; we do not leave it until userspace + * explicitly marks the vCPU as runnable. Request that we suspend again + * later. + */ + kvm_make_request(KVM_REQ_SUSPEND, vcpu); + + /* + * Check to make sure the vCPU is actually runnable. If so, exit to + * userspace informing it of the wakeup condition. + */ + if (kvm_arch_vcpu_runnable(vcpu)) { + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + return 0; + } + + /* + * Otherwise, we were unblocked to process a different event, such as a + * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to + * process the event. 
+ */ + return 1; +} + +/** + * check_vcpu_requests - check and handle pending vCPU requests + * @vcpu: the VCPU pointer + * + * Return: 1 if we should enter the guest + * 0 if we should exit to userspace + * < 0 if we should exit to userspace, where the return value indicates + * an error + */ +static int check_vcpu_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + + if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) + kvm_vcpu_sleep(vcpu); + + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + + /* + * Clear IRQ_PENDING requests that were made to guarantee + * that a VCPU sees new virtual interrupts. + */ + kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); + + /* Process interrupts deactivated through a trap */ + if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu)) + kvm_vgic_process_async_update(vcpu); + + if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) + kvm_update_stolen_time(vcpu); + + if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) { + /* The distributor enable bits were changed */ + preempt_disable(); + vgic_v4_put(vcpu); + vgic_v4_load(vcpu); + preempt_enable(); + } + + if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) + kvm_vcpu_reload_pmu(vcpu); + + if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu)) + kvm_vcpu_pmu_restore_guest(vcpu); + + if (kvm_check_request(KVM_REQ_SUSPEND, vcpu)) + return kvm_vcpu_suspend(vcpu); + + if (kvm_dirty_ring_check_request(vcpu)) + return 0; + + check_nested_vcpu_requests(vcpu); + } + + return 1; +} + +static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) +{ + if (likely(!vcpu_mode_is_32bit(vcpu))) + return false; + + if (vcpu_has_nv(vcpu)) + return true; + + return !kvm_supports_32bit_el0(); +} + +/** + * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest + * @vcpu: The VCPU pointer + * @ret: Pointer to write optional return code + * + * Returns: true if the VCPU needs to return to a preemptible + interruptible + * and skip guest entry. + * + * This function disambiguates between two different types of exits: exits to a + * preemptible + interruptible kernel context and exits to userspace. For an + * exit to userspace, this function will write the return code to ret and return + * true. For an exit to preemptible + interruptible kernel context (i.e. check + * for pending work and re-enter), return true without writing to ret. + */ +static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) +{ + struct kvm_run *run = vcpu->run; + + /* + * If we're using a userspace irqchip, then check if we need + * to tell a userspace irqchip about timer or PMU level + * changes and if so, exit to userspace (the actual level + * state gets updated in kvm_timer_update_run and + * kvm_pmu_update_run below). + */ + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { + if (kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { + *ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + return true; + } + } + + if (unlikely(vcpu_on_unsupported_cpu(vcpu))) { + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED; + run->fail_entry.cpu = smp_processor_id(); + *ret = 0; + return true; + } + + return kvm_request_pending(vcpu) || + xfer_to_guest_mode_work_pending(); +} + +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. 
+ * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_enter_irqoff(); + ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); + guest_state_exit_irqoff(); + + return ret; +} + +/** + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code + * @vcpu: The VCPU pointer + * + * This function is called through the VCPU_RUN ioctl called from user space. It + * will execute VM code in a loop until the time slice for the process is used + * or some emulation is needed from user space in which case the function will + * return with return value 0 and with the kvm_run structure filled in with the + * required data for the requested emulation. + */ +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int ret; + + if (run->exit_reason == KVM_EXIT_MMIO) { + ret = kvm_handle_mmio_return(vcpu); + if (ret <= 0) + return ret; + } + + vcpu_load(vcpu); + + if (!vcpu->wants_to_run) { + ret = -EINTR; + goto out; + } + + kvm_sigset_activate(vcpu); + + ret = 1; + run->exit_reason = KVM_EXIT_UNKNOWN; + run->flags = 0; + while (ret > 0) { + /* + * Check conditions before entering the guest + */ + ret = kvm_xfer_to_guest_mode_handle_work(vcpu); + if (!ret) + ret = 1; + + if (ret > 0) + ret = check_vcpu_requests(vcpu); + + /* + * Preparing the interrupts to be injected also + * involves poking the GIC, which must be done in a + * non-preemptible context. + */ + preempt_disable(); + + kvm_nested_flush_hwstate(vcpu); + + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_flush_hwstate(vcpu); + + local_irq_disable(); + + kvm_vgic_flush_hwstate(vcpu); + + kvm_pmu_update_vcpu_events(vcpu); + + /* + * Ensure we set mode to IN_GUEST_MODE after we disable + * interrupts and before the final VCPU requests check. + * See the comment in kvm_vcpu_exiting_guest_mode() and + * Documentation/virt/kvm/vcpu-requests.rst + */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + + if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { + vcpu->mode = OUTSIDE_GUEST_MODE; + isb(); /* Ensure work in x_flush_hwstate is committed */ + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_sync_hwstate(vcpu); + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + kvm_timer_sync_user(vcpu); + kvm_vgic_sync_hwstate(vcpu); + local_irq_enable(); + preempt_enable(); + continue; + } + + kvm_arch_vcpu_ctxflush_fp(vcpu); + + /************************************************************** + * Enter the guest + */ + trace_kvm_entry(*vcpu_pc(vcpu)); + guest_timing_enter_irqoff(); + + ret = kvm_arm_vcpu_enter_exit(vcpu); + + vcpu->mode = OUTSIDE_GUEST_MODE; + vcpu->stat.exits++; + /* + * Back from guest + *************************************************************/ + + /* + * We must sync the PMU state before the vgic state so + * that the vgic can properly sample the updated state of the + * interrupt line. + */ + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_sync_hwstate(vcpu); + + /* + * Sync the vgic state before syncing the timer state because + * the timer code needs to know if the virtual timer + * interrupts are active. + */ + kvm_vgic_sync_hwstate(vcpu); + + /* + * Sync the timer hardware state before enabling interrupts as + * we don't want vtimer interrupts to race with syncing the + * timer virtual interrupt state. 
+ */ + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + kvm_timer_sync_user(vcpu); + + if (is_hyp_ctxt(vcpu)) + kvm_timer_sync_nested(vcpu); + + kvm_arch_vcpu_ctxsync_fp(vcpu); + + /* + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. + * + * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other + * context synchronization event) is necessary to ensure that + * pending interrupts are taken. + */ + if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) { + local_irq_enable(); + isb(); + local_irq_disable(); + } + + guest_timing_exit_irqoff(); + + local_irq_enable(); + + trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + + /* Exit types that need handling before we can be preempted */ + handle_exit_early(vcpu, ret); + + kvm_nested_sync_hwstate(vcpu); + + preempt_enable(); + + /* + * The ARMv8 architecture doesn't give the hypervisor + * a mechanism to prevent a guest from dropping to AArch32 EL0 + * if implemented by the CPU. If we spot the guest in such + * state and that we decided it wasn't supposed to do so (like + * with the asymmetric AArch32 case), return to userspace with + * a fatal error. + */ + if (vcpu_mode_is_bad_32bit(vcpu)) { + /* + * As we have caught the guest red-handed, decide that + * it isn't fit for purpose anymore by making the vcpu + * invalid. The VMM can try and fix it by issuing a + * KVM_ARM_VCPU_INIT if it really wants to. + */ + vcpu_clear_flag(vcpu, VCPU_INITIALIZED); + ret = ARM_EXCEPTION_IL; + } + + ret = handle_exit(vcpu, ret); + } + + /* Tell userspace about in-kernel device output levels */ + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { + kvm_timer_update_run(vcpu); + kvm_pmu_update_run(vcpu); + } + + kvm_sigset_deactivate(vcpu); + +out: + /* + * In the unlikely event that we are returning to userspace + * with pending exceptions or PC adjustment, commit these + * adjustments in order to give userspace a consistent view of + * the vcpu state. Note that this relies on __kvm_adjust_pc() + * being preempt-safe on VHE. + */ + if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) || + vcpu_get_flag(vcpu, INCREMENT_PC))) + kvm_call_hyp(__kvm_adjust_pc, vcpu); + + vcpu_put(vcpu); + return ret; +} + +static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) +{ + int bit_index; + bool set; + unsigned long *hcr; + + if (number == KVM_ARM_IRQ_CPU_IRQ) + bit_index = __ffs(HCR_VI); + else /* KVM_ARM_IRQ_CPU_FIQ */ + bit_index = __ffs(HCR_VF); + + hcr = vcpu_hcr(vcpu); + if (level) + set = test_and_set_bit(bit_index, hcr); + else + set = test_and_clear_bit(bit_index, hcr); + + /* + * If we didn't change anything, no need to wake up or kick other CPUs + */ + if (set == level) + return 0; + + /* + * The vcpu irq_lines field was updated, wake up sleeping VCPUs and + * trigger a world-switch round on the running physical CPU to set the + * virtual IRQ/FIQ fields in the HCR appropriately. 
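+	 * The request itself carries no payload; it is simply acknowledged in
+	 * check_vcpu_requests() so that the updated HCR value takes effect on
+	 * the next guest entry.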
+ */ + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + + return 0; +} + +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, + bool line_status) +{ + u32 irq = irq_level->irq; + unsigned int irq_type, vcpu_id, irq_num; + struct kvm_vcpu *vcpu = NULL; + bool level = irq_level->level; + + irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; + vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; + vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); + irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; + + trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level); + + switch (irq_type) { + case KVM_ARM_IRQ_TYPE_CPU: + if (irqchip_in_kernel(kvm)) + return -ENXIO; + + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + if (!vcpu) + return -EINVAL; + + if (irq_num > KVM_ARM_IRQ_CPU_FIQ) + return -EINVAL; + + return vcpu_interrupt_line(vcpu, irq_num, level); + case KVM_ARM_IRQ_TYPE_PPI: + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + if (!vcpu) + return -EINVAL; + + if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + + return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL); + case KVM_ARM_IRQ_TYPE_SPI: + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + if (irq_num < VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + + return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL); + } + + return -EINVAL; +} + +static unsigned long system_supported_vcpu_features(void) +{ + unsigned long features = KVM_VCPU_VALID_FEATURES; + + if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1)) + clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features); + + if (!kvm_supports_guest_pmuv3()) + clear_bit(KVM_ARM_VCPU_PMU_V3, &features); + + if (!system_supports_sve()) + clear_bit(KVM_ARM_VCPU_SVE, &features); + + if (!kvm_has_full_ptr_auth()) { + clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features); + clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features); + } + + if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT)) + clear_bit(KVM_ARM_VCPU_HAS_EL2, &features); + + return features; +} + +static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned long features = init->features[0]; + int i; + + if (features & ~KVM_VCPU_VALID_FEATURES) + return -ENOENT; + + for (i = 1; i < ARRAY_SIZE(init->features); i++) { + if (init->features[i]) + return -ENOENT; + } + + if (features & ~system_supported_vcpu_features()) + return -EINVAL; + + /* + * For now make sure that both address/generic pointer authentication + * features are requested by the userspace together. 
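+	 * This mirrors kvm_has_full_ptr_auth(), which only advertises pointer
+	 * authentication when both flavours are implemented.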
+ */ + if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) != + test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features)) + return -EINVAL; + + if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) + return 0; + + /* MTE is incompatible with AArch32 */ + if (kvm_has_mte(vcpu->kvm)) + return -EINVAL; + + /* NV is incompatible with AArch32 */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features)) + return -EINVAL; + + return 0; +} + +static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned long features = init->features[0]; + + return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features, + KVM_VCPU_MAX_FEATURES); +} + +static int kvm_setup_vcpu(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret = 0; + + /* + * When the vCPU has a PMU, but no PMU is set for the guest + * yet, set the default one. + */ + if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu) + ret = kvm_arm_set_default_pmu(kvm); + + /* Prepare for nested if required */ + if (!ret && vcpu_has_nv(vcpu)) + ret = kvm_vcpu_init_nested(vcpu); + + return ret; +} + +static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned long features = init->features[0]; + struct kvm *kvm = vcpu->kvm; + int ret = -EINVAL; + + mutex_lock(&kvm->arch.config_lock); + + if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) && + kvm_vcpu_init_changed(vcpu, init)) + goto out_unlock; + + bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES); + + ret = kvm_setup_vcpu(vcpu); + if (ret) + goto out_unlock; + + /* Now we know what it is, we can reset it. */ + kvm_reset_vcpu(vcpu); + + set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags); + vcpu_set_flag(vcpu, VCPU_INITIALIZED); + ret = 0; +out_unlock: + mutex_unlock(&kvm->arch.config_lock); + return ret; +} + +static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + int ret; + + if (init->target != KVM_ARM_TARGET_GENERIC_V8 && + init->target != kvm_target_cpu()) + return -EINVAL; + + ret = kvm_vcpu_init_check_features(vcpu, init); + if (ret) + return ret; + + if (!kvm_vcpu_initialized(vcpu)) + return __kvm_vcpu_set_target(vcpu, init); + + if (kvm_vcpu_init_changed(vcpu, init)) + return -EINVAL; + + kvm_reset_vcpu(vcpu); + return 0; +} + +static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, + struct kvm_vcpu_init *init) +{ + bool power_off = false; + int ret; + + /* + * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid + * reflecting it in the finalized feature set, thus limiting its scope + * to a single KVM_ARM_VCPU_INIT call. + */ + if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) { + init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF); + power_off = true; + } + + ret = kvm_vcpu_set_target(vcpu, init); + if (ret) + return ret; + + /* + * Ensure a rebooted VM will fault in RAM pages and detect if the + * guest MMU is turned off and flush the caches as needed. + * + * S2FWB enforces all memory accesses to RAM being cacheable, + * ensuring that the data side is always coherent. We still + * need to invalidate the I-cache though, as FWB does *not* + * imply CTR_EL0.DIC. + */ + if (vcpu_has_run_once(vcpu)) { + if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + stage2_unmap_vm(vcpu->kvm); + else + icache_inval_all_pou(); + } + + vcpu_reset_hcr(vcpu); + + /* + * Handle the "start in power-off" case. 
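+	 * mp_state updates are serialised by mp_state_lock, just like the
+	 * KVM_SET_MP_STATE path in kvm_arch_vcpu_ioctl_set_mpstate().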
+ */ + spin_lock(&vcpu->arch.mp_state_lock); + + if (power_off) + __kvm_arm_vcpu_power_off(vcpu); + else + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); + + spin_unlock(&vcpu->arch.mp_state_lock); + + return 0; +} + +static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + default: + ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr); + break; + } + + return ret; +} + +static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + default: + ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr); + break; + } + + return ret; +} + +static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + default: + ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr); + break; + } + + return ret; +} + +static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + memset(events, 0, sizeof(*events)); + + return __kvm_arm_vcpu_get_events(vcpu, events); +} + +static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + int i; + + /* check whether the reserved field is zero */ + for (i = 0; i < ARRAY_SIZE(events->reserved); i++) + if (events->reserved[i]) + return -EINVAL; + + /* check whether the pad field is zero */ + for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++) + if (events->exception.pad[i]) + return -EINVAL; + + return __kvm_arm_vcpu_set_events(vcpu, events); +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + struct kvm_device_attr attr; + long r; + + switch (ioctl) { + case KVM_ARM_VCPU_INIT: { + struct kvm_vcpu_init init; + + r = -EFAULT; + if (copy_from_user(&init, argp, sizeof(init))) + break; + + r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); + break; + } + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: { + struct kvm_one_reg reg; + + r = -ENOEXEC; + if (unlikely(!kvm_vcpu_initialized(vcpu))) + break; + + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + break; + + /* + * We could owe a reset due to PSCI. Handle the pending reset + * here to ensure userspace register accesses are ordered after + * the reset. 
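+		 * Otherwise the pending reset would only be serviced on the
+		 * next KVM_RUN, in check_vcpu_requests().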
+ */ + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + + if (ioctl == KVM_SET_ONE_REG) + r = kvm_arm_set_reg(vcpu, ®); + else + r = kvm_arm_get_reg(vcpu, ®); + break; + } + case KVM_GET_REG_LIST: { + struct kvm_reg_list __user *user_list = argp; + struct kvm_reg_list reg_list; + unsigned n; + + r = -ENOEXEC; + if (unlikely(!kvm_vcpu_initialized(vcpu))) + break; + + r = -EPERM; + if (!kvm_arm_vcpu_is_finalized(vcpu)) + break; + + r = -EFAULT; + if (copy_from_user(®_list, user_list, sizeof(reg_list))) + break; + n = reg_list.n; + reg_list.n = kvm_arm_num_regs(vcpu); + if (copy_to_user(user_list, ®_list, sizeof(reg_list))) + break; + r = -E2BIG; + if (n < reg_list.n) + break; + r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); + break; + } + case KVM_SET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_arm_vcpu_set_attr(vcpu, &attr); + break; + } + case KVM_GET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_arm_vcpu_get_attr(vcpu, &attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_arm_vcpu_has_attr(vcpu, &attr); + break; + } + case KVM_GET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + + if (kvm_arm_vcpu_get_events(vcpu, &events)) + return -EINVAL; + + if (copy_to_user(argp, &events, sizeof(events))) + return -EFAULT; + + return 0; + } + case KVM_SET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + + if (copy_from_user(&events, argp, sizeof(events))) + return -EFAULT; + + return kvm_arm_vcpu_set_events(vcpu, &events); + } + case KVM_ARM_VCPU_FINALIZE: { + int what; + + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + + if (get_user(what, (const int __user *)argp)) + return -EFAULT; + + return kvm_arm_vcpu_finalize(vcpu, what); + } + default: + r = -EINVAL; + } + + return r; +} + +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + +} + +static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, + struct kvm_arm_device_addr *dev_addr) +{ + switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) { + case KVM_ARM_DEVICE_VGIC_V2: + if (!vgic_present) + return -ENXIO; + return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr); + default: + return -ENODEV; + } +} + +static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_ARM_VM_SMCCC_CTRL: + return kvm_vm_smccc_has_attr(kvm, attr); + default: + return -ENXIO; + } +} + +static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_ARM_VM_SMCCC_CTRL: + return kvm_vm_smccc_set_attr(kvm, attr); + default: + return -ENXIO; + } +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + struct kvm_device_attr attr; + + switch (ioctl) { + case KVM_CREATE_IRQCHIP: { + int ret; + if (!vgic_present) + return -ENXIO; + mutex_lock(&kvm->lock); + ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + mutex_unlock(&kvm->lock); + return ret; + } + case KVM_ARM_SET_DEVICE_ADDR: { + struct kvm_arm_device_addr dev_addr; + + if (copy_from_user(&dev_addr, argp, 
sizeof(dev_addr))) + return -EFAULT; + return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); + } + case KVM_ARM_PREFERRED_TARGET: { + struct kvm_vcpu_init init = { + .target = KVM_ARM_TARGET_GENERIC_V8, + }; + + if (copy_to_user(argp, &init, sizeof(init))) + return -EFAULT; + + return 0; + } + case KVM_ARM_MTE_COPY_TAGS: { + struct kvm_arm_copy_mte_tags copy_tags; + + if (copy_from_user(©_tags, argp, sizeof(copy_tags))) + return -EFAULT; + return kvm_vm_ioctl_mte_copy_tags(kvm, ©_tags); + } + case KVM_ARM_SET_COUNTER_OFFSET: { + struct kvm_arm_counter_offset offset; + + if (copy_from_user(&offset, argp, sizeof(offset))) + return -EFAULT; + return kvm_vm_ioctl_set_counter_offset(kvm, &offset); + } + case KVM_HAS_DEVICE_ATTR: { + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + return kvm_vm_has_attr(kvm, &attr); + } + case KVM_SET_DEVICE_ATTR: { + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + return kvm_vm_set_attr(kvm, &attr); + } + case KVM_ARM_GET_REG_WRITABLE_MASKS: { + struct reg_mask_range range; + + if (copy_from_user(&range, argp, sizeof(range))) + return -EFAULT; + return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range); + } + default: + return -EINVAL; + } +} + +static unsigned long nvhe_percpu_size(void) +{ + return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) - + (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start); +} + +static unsigned long nvhe_percpu_order(void) +{ + unsigned long size = nvhe_percpu_size(); + + return size ? get_order(size) : 0; +} + +static size_t pkvm_host_sve_state_order(void) +{ + return get_order(pkvm_host_sve_state_size()); +} + +/* A lookup table holding the hypervisor VA for each vector slot */ +static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; + +static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot) +{ + hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot); +} + +static int kvm_init_vector_slots(void) +{ + int err; + void *base; + + base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); + kvm_init_vector_slot(base, HYP_VECTOR_DIRECT); + + base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); + kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT); + + if (kvm_system_needs_idmapped_vectors() && + !is_protected_kvm_enabled()) { + err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), + __BP_HARDEN_HYP_VECS_SZ, &base); + if (err) + return err; + } + + kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT); + kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT); + return 0; +} + +static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) +{ + struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); + unsigned long tcr; + + /* + * Calculate the raw per-cpu offset without a translation from the + * kernel's mapping to the linear mapping, and store it in tpidr_el2 + * so that we can use adr_l to access per-cpu variables in EL2. + * Also drop the KASAN tag which gets in the way... 
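+	 * The offset is the kernel address of this CPU's copy of the hyp
+	 * per-cpu area minus the kernel image address of the hyp
+	 * __per_cpu_start symbol.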
+ */ + params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) - + (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); + + params->mair_el2 = read_sysreg(mair_el1); + + tcr = read_sysreg(tcr_el1); + if (cpus_have_final_cap(ARM64_KVM_HVHE)) { + tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK); + tcr |= TCR_EPD1_MASK; + } else { + unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr); + + tcr &= TCR_EL2_MASK; + tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips); + if (lpa2_is_enabled()) + tcr |= TCR_EL2_DS; + } + tcr |= TCR_T0SZ(hyp_va_bits); + params->tcr_el2 = tcr; + + params->pgd_pa = kvm_mmu_get_httbr(); + if (is_protected_kvm_enabled()) + params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; + else + params->hcr_el2 = HCR_HOST_NVHE_FLAGS; + if (cpus_have_final_cap(ARM64_KVM_HVHE)) + params->hcr_el2 |= HCR_E2H; + params->vttbr = params->vtcr = 0; + + /* + * Flush the init params from the data cache because the struct will + * be read while the MMU is off. + */ + kvm_flush_dcache_to_poc(params, sizeof(*params)); +} + +static void hyp_install_host_vector(void) +{ + struct kvm_nvhe_init_params *params; + struct arm_smccc_res res; + + /* Switch from the HYP stub to our own HYP init vector */ + __hyp_set_vectors(kvm_get_idmap_vector()); + + /* + * Call initialization code, and switch to the full blown HYP code. + * If the cpucaps haven't been finalized yet, something has gone very + * wrong, and hyp will crash and burn when it uses any + * cpus_have_*_cap() wrapper. + */ + BUG_ON(!system_capabilities_finalized()); + params = this_cpu_ptr_nvhe_sym(kvm_init_params); + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res); + WARN_ON(res.a0 != SMCCC_RET_SUCCESS); +} + +static void cpu_init_hyp_mode(void) +{ + hyp_install_host_vector(); + + /* + * Disabling SSBD on a non-VHE system requires us to enable SSBS + * at EL2. + */ + if (this_cpu_has_cap(ARM64_SSBS) && + arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) { + kvm_call_hyp_nvhe(__kvm_enable_ssbs); + } +} + +static void cpu_hyp_reset(void) +{ + if (!is_kernel_in_hyp_mode()) + __hyp_reset_vectors(); +} + +/* + * EL2 vectors can be mapped and rerouted in a number of ways, + * depending on the kernel configuration and CPU present: + * + * - If the CPU is affected by Spectre-v2, the hardening sequence is + * placed in one of the vector slots, which is executed before jumping + * to the real vectors. + * + * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot + * containing the hardening sequence is mapped next to the idmap page, + * and executed before jumping to the real vectors. + * + * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an + * empty slot is selected, mapped next to the idmap page, and + * executed before jumping to the real vectors. + * + * Note that ARM64_SPECTRE_V3A is somewhat incompatible with + * VHE, as we don't have hypervisor-specific mappings. If the system + * is VHE and yet selects this capability, it will be ignored. 
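+ *
+ * The slot to use on a given CPU is picked at runtime by
+ * cpu_set_hyp_vector(), based on this_cpu_ptr(&bp_hardening_data)->slot.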
+ */ +static void cpu_set_hyp_vector(void) +{ + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + void *vector = hyp_spectre_vector_selector[data->slot]; + + if (!is_protected_kvm_enabled()) + *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector; + else + kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot); +} + +static void cpu_hyp_init_context(void) +{ + kvm_init_host_cpu_context(host_data_ptr(host_ctxt)); + kvm_init_host_debug_data(); + + if (!is_kernel_in_hyp_mode()) + cpu_init_hyp_mode(); +} + +static void cpu_hyp_init_features(void) +{ + cpu_set_hyp_vector(); + + if (is_kernel_in_hyp_mode()) { + kvm_timer_init_vhe(); + kvm_debug_init_vhe(); + } + + if (vgic_present) + kvm_vgic_init_cpu_hardware(); +} + +static void cpu_hyp_reinit(void) +{ + cpu_hyp_reset(); + cpu_hyp_init_context(); + cpu_hyp_init_features(); +} + +static void cpu_hyp_init(void *discard) +{ + if (!__this_cpu_read(kvm_hyp_initialized)) { + cpu_hyp_reinit(); + __this_cpu_write(kvm_hyp_initialized, 1); + } +} + +static void cpu_hyp_uninit(void *discard) +{ + if (!is_protected_kvm_enabled() && __this_cpu_read(kvm_hyp_initialized)) { + cpu_hyp_reset(); + __this_cpu_write(kvm_hyp_initialized, 0); + } +} + +int kvm_arch_enable_virtualization_cpu(void) +{ + /* + * Most calls to this function are made with migration + * disabled, but not with preemption disabled. The former is + * enough to ensure correctness, but most of the helpers + * expect the later and will throw a tantrum otherwise. + */ + preempt_disable(); + + cpu_hyp_init(NULL); + + kvm_vgic_cpu_up(); + kvm_timer_cpu_up(); + + preempt_enable(); + + return 0; +} + +void kvm_arch_disable_virtualization_cpu(void) +{ + kvm_timer_cpu_down(); + kvm_vgic_cpu_down(); + + if (!is_protected_kvm_enabled()) + cpu_hyp_uninit(NULL); +} + +#ifdef CONFIG_CPU_PM +static int hyp_init_cpu_pm_notifier(struct notifier_block *self, + unsigned long cmd, + void *v) +{ + /* + * kvm_hyp_initialized is left with its old value over + * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should + * re-enable hyp. + */ + switch (cmd) { + case CPU_PM_ENTER: + if (__this_cpu_read(kvm_hyp_initialized)) + /* + * don't update kvm_hyp_initialized here + * so that the hyp will be re-enabled + * when we resume. See below. + */ + cpu_hyp_reset(); + + return NOTIFY_OK; + case CPU_PM_ENTER_FAILED: + case CPU_PM_EXIT: + if (__this_cpu_read(kvm_hyp_initialized)) + /* The hyp was enabled before suspend. */ + cpu_hyp_reinit(); + + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block hyp_init_cpu_pm_nb = { + .notifier_call = hyp_init_cpu_pm_notifier, +}; + +static void __init hyp_cpu_pm_init(void) +{ + if (!is_protected_kvm_enabled()) + cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); +} +static void __init hyp_cpu_pm_exit(void) +{ + if (!is_protected_kvm_enabled()) + cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); +} +#else +static inline void __init hyp_cpu_pm_init(void) +{ +} +static inline void __init hyp_cpu_pm_exit(void) +{ +} +#endif + +static void __init init_cpu_logical_map(void) +{ + unsigned int cpu; + + /* + * Copy the MPIDR <-> logical CPU ID mapping to hyp. + * Only copy the set of online CPUs whose features have been checked + * against the finalized system capabilities. The hypervisor will not + * allow any other CPUs from the `possible` set to boot. 
+ */ + for_each_online_cpu(cpu) + hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu); +} + +#define init_psci_0_1_impl_state(config, what) \ + config.psci_0_1_ ## what ## _implemented = psci_ops.what + +static bool __init init_psci_relay(void) +{ + /* + * If PSCI has not been initialized, protected KVM cannot install + * itself on newly booted CPUs. + */ + if (!psci_ops.get_version) { + kvm_err("Cannot initialize protected mode without PSCI\n"); + return false; + } + + kvm_host_psci_config.version = psci_ops.get_version(); + kvm_host_psci_config.smccc_version = arm_smccc_get_version(); + + if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { + kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); + init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend); + init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on); + init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off); + init_psci_0_1_impl_state(kvm_host_psci_config, migrate); + } + return true; +} + +static int __init init_subsystems(void) +{ + int err = 0; + + /* + * Enable hardware so that subsystem initialisation can access EL2. + */ + on_each_cpu(cpu_hyp_init, NULL, 1); + + /* + * Register CPU lower-power notifier + */ + hyp_cpu_pm_init(); + + /* + * Init HYP view of VGIC + */ + err = kvm_vgic_hyp_init(); + switch (err) { + case 0: + vgic_present = true; + break; + case -ENODEV: + case -ENXIO: + /* + * No VGIC? No pKVM for you. + * + * Protected mode assumes that VGICv3 is present, so no point + * in trying to hobble along if vgic initialization fails. + */ + if (is_protected_kvm_enabled()) + goto out; + + /* + * Otherwise, userspace could choose to implement a GIC for its + * guest on non-cooperative hardware. + */ + vgic_present = false; + err = 0; + break; + default: + goto out; + } + + if (kvm_mode == KVM_MODE_NV && + !(vgic_present && (kvm_vgic_global_state.type == VGIC_V3 || + kvm_vgic_global_state.has_gcie_v3_compat))) { + kvm_err("NV support requires GICv3 or GICv5 with legacy support, giving up\n"); + err = -EINVAL; + goto out; + } + + /* + * Init HYP architected timer support + */ + err = kvm_timer_hyp_init(vgic_present); + if (err) + goto out; + + kvm_register_perf_callbacks(NULL); + +out: + if (err) + hyp_cpu_pm_exit(); + + if (err || !is_protected_kvm_enabled()) + on_each_cpu(cpu_hyp_uninit, NULL, 1); + + return err; +} + +static void __init teardown_subsystems(void) +{ + kvm_unregister_perf_callbacks(); + hyp_cpu_pm_exit(); +} + +static void __init teardown_hyp_mode(void) +{ + bool free_sve = system_supports_sve() && is_protected_kvm_enabled(); + int cpu; + + free_hyp_pgds(); + for_each_possible_cpu(cpu) { + if (per_cpu(kvm_hyp_initialized, cpu)) + continue; + + free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT); + + if (!kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]) + continue; + + if (free_sve) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + free_pages((unsigned long) sve_state, pkvm_host_sve_state_order()); + } + + free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); + + } +} + +static int __init do_pkvm_init(u32 hyp_va_bits) +{ + void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); + int ret; + + preempt_disable(); + cpu_hyp_init_context(); + ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, + num_possible_cpus(), kern_hyp_va(per_cpu_base), + hyp_va_bits); + cpu_hyp_init_features(); + + /* + * The stub hypercalls are now disabled, so set our local flag 
to + * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu(). + */ + __this_cpu_write(kvm_hyp_initialized, 1); + preempt_enable(); + + return ret; +} + +static u64 get_hyp_id_aa64pfr0_el1(void) +{ + /* + * Track whether the system isn't affected by spectre/meltdown in the + * hypervisor's view of id_aa64pfr0_el1, used for protected VMs. + * Although this is per-CPU, we make it global for simplicity, e.g., not + * to have to worry about vcpu migration. + * + * Unlike for non-protected VMs, userspace cannot override this for + * protected VMs. + */ + u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + val &= ~(ID_AA64PFR0_EL1_CSV2 | + ID_AA64PFR0_EL1_CSV3); + + val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV2, + arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED); + val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV3, + arm64_get_meltdown_state() == SPECTRE_UNAFFECTED); + + return val; +} + +static void kvm_hyp_init_symbols(void) +{ + kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1(); + kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); + kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); + kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); + kvm_nvhe_sym(__icache_flags) = __icache_flags; + kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + + /* Propagate the FGT state to the nVHE side */ + kvm_nvhe_sym(hfgrtr_masks) = hfgrtr_masks; + kvm_nvhe_sym(hfgwtr_masks) = hfgwtr_masks; + kvm_nvhe_sym(hfgitr_masks) = hfgitr_masks; + kvm_nvhe_sym(hdfgrtr_masks) = hdfgrtr_masks; + kvm_nvhe_sym(hdfgwtr_masks) = hdfgwtr_masks; + kvm_nvhe_sym(hafgrtr_masks) = hafgrtr_masks; + kvm_nvhe_sym(hfgrtr2_masks) = hfgrtr2_masks; + kvm_nvhe_sym(hfgwtr2_masks) = hfgwtr2_masks; + kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks; + kvm_nvhe_sym(hdfgrtr2_masks)= hdfgrtr2_masks; + kvm_nvhe_sym(hdfgwtr2_masks)= hdfgwtr2_masks; + + /* + * Flush entire BSS since part of its data containing init symbols is read + * while the MMU is off. + */ + kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start), + kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start)); +} + +static int __init kvm_hyp_init_protection(u32 hyp_va_bits) +{ + void *addr = phys_to_virt(hyp_mem_base); + int ret; + + ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); + if (ret) + return ret; + + ret = do_pkvm_init(hyp_va_bits); + if (ret) + return ret; + + free_hyp_pgds(); + + return 0; +} + +static int init_pkvm_host_sve_state(void) +{ + int cpu; + + if (!system_supports_sve()) + return 0; + + /* Allocate pages for host sve state in protected mode. */ + for_each_possible_cpu(cpu) { + struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order()); + + if (!page) + return -ENOMEM; + + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page); + } + + /* + * Don't map the pages in hyp since these are only used in protected + * mode, which will (re)create its own mapping when initialized. 
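+	 * The pointer stored above is a kernel VA; it is converted to a hyp
+	 * VA in finalize_init_hyp_mode(), once initialisation can no longer
+	 * fail.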
+ */ + + return 0; +} + +/* + * Finalizes the initialization of hyp mode, once everything else is initialized + * and the initialziation process cannot fail. + */ +static void finalize_init_hyp_mode(void) +{ + int cpu; + + if (system_supports_sve() && is_protected_kvm_enabled()) { + for_each_possible_cpu(cpu) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = + kern_hyp_va(sve_state); + } + } +} + +static void pkvm_hyp_init_ptrauth(void) +{ + struct kvm_cpu_context *hyp_ctxt; + int cpu; + + for_each_possible_cpu(cpu) { + hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu); + hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long(); + } +} + +/* Inits Hyp-mode on all online CPUs */ +static int __init init_hyp_mode(void) +{ + u32 hyp_va_bits; + int cpu; + int err = -ENOMEM; + + /* + * The protected Hyp-mode cannot be initialized if the memory pool + * allocation has failed. + */ + if (is_protected_kvm_enabled() && !hyp_mem_base) + goto out_err; + + /* + * Allocate Hyp PGD and setup Hyp identity mapping + */ + err = kvm_mmu_init(&hyp_va_bits); + if (err) + goto out_err; + + /* + * Allocate stack pages for Hypervisor-mode + */ + for_each_possible_cpu(cpu) { + unsigned long stack_base; + + stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT); + if (!stack_base) { + err = -ENOMEM; + goto out_err; + } + + per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base; + } + + /* + * Allocate and initialize pages for Hypervisor-mode percpu regions. + */ + for_each_possible_cpu(cpu) { + struct page *page; + void *page_addr; + + page = alloc_pages(GFP_KERNEL, nvhe_percpu_order()); + if (!page) { + err = -ENOMEM; + goto out_err; + } + + page_addr = page_address(page); + memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); + kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr; + } + + /* + * Map the Hyp-code called directly from the host + */ + err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), + kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); + if (err) { + kvm_err("Cannot map world-switch code\n"); + goto out_err; + } + + err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start), + kvm_ksym_ref(__hyp_data_end), PAGE_HYP); + if (err) { + kvm_err("Cannot map .hyp.data section\n"); + goto out_err; + } + + err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start), + kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO); + if (err) { + kvm_err("Cannot map .hyp.rodata section\n"); + goto out_err; + } + + err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), + kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); + if (err) { + kvm_err("Cannot map rodata section\n"); + goto out_err; + } + + /* + * .hyp.bss is guaranteed to be placed at the beginning of the .bss + * section thanks to an assertion in the linker script. Map it RW and + * the rest of .bss RO. 
+ */ + err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start), + kvm_ksym_ref(__hyp_bss_end), PAGE_HYP); + if (err) { + kvm_err("Cannot map hyp bss section: %d\n", err); + goto out_err; + } + + err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end), + kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); + if (err) { + kvm_err("Cannot map bss section\n"); + goto out_err; + } + + /* + * Map the Hyp stack pages + */ + for_each_possible_cpu(cpu) { + struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); + char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu); + + err = create_hyp_stack(__pa(stack_base), ¶ms->stack_hyp_va); + if (err) { + kvm_err("Cannot map hyp stack\n"); + goto out_err; + } + + /* + * Save the stack PA in nvhe_init_params. This will be needed + * to recreate the stack mapping in protected nVHE mode. + * __hyp_pa() won't do the right thing there, since the stack + * has been mapped in the flexible private VA space. + */ + params->stack_pa = __pa(stack_base); + } + + for_each_possible_cpu(cpu) { + char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; + char *percpu_end = percpu_begin + nvhe_percpu_size(); + + /* Map Hyp percpu pages */ + err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP); + if (err) { + kvm_err("Cannot map hyp percpu region\n"); + goto out_err; + } + + /* Prepare the CPU initialization parameters */ + cpu_prepare_hyp_mode(cpu, hyp_va_bits); + } + + kvm_hyp_init_symbols(); + + if (is_protected_kvm_enabled()) { + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && + cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH)) + pkvm_hyp_init_ptrauth(); + + init_cpu_logical_map(); + + if (!init_psci_relay()) { + err = -ENODEV; + goto out_err; + } + + err = init_pkvm_host_sve_state(); + if (err) + goto out_err; + + err = kvm_hyp_init_protection(hyp_va_bits); + if (err) { + kvm_err("Failed to init hyp memory protection\n"); + goto out_err; + } + } + + return 0; + +out_err: + teardown_hyp_mode(); + kvm_err("error initializing Hyp mode: %d\n", err); + return err; +} + +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) +{ + struct kvm_vcpu *vcpu = NULL; + struct kvm_mpidr_data *data; + unsigned long i; + + mpidr &= MPIDR_HWID_BITMASK; + + rcu_read_lock(); + data = rcu_dereference(kvm->arch.mpidr_data); + + if (data) { + u16 idx = kvm_mpidr_index(data, mpidr); + + vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]); + if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu)) + vcpu = NULL; + } + + rcu_read_unlock(); + + if (vcpu) + return vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) + return vcpu; + } + return NULL; +} + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; + + /* + * The only thing we have a chance of directly-injecting is LPIs. Maybe + * one day... 
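+	 *
+	 * Concretely, only KVM_IRQ_ROUTING_MSI entries are eligible: an MSI
+	 * routed through the ITS can be turned into a GICv4 vLPI by
+	 * kvm_vgic_v4_set_forwarding(), anything else keeps going through
+	 * the normal software injection path.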
+ */ + if (irq_entry->type != KVM_IRQ_ROUTING_MSI) + return 0; + + return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, + &irqfd->irq_entry); +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; + + if (irq_entry->type != KVM_IRQ_ROUTING_MSI) + return; + + kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq); +} + +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + if (old->type == KVM_IRQ_ROUTING_MSI && + new->type == KVM_IRQ_ROUTING_MSI && + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) + return; + + /* + * Remapping the vLPI requires taking the its_lock mutex to resolve + * the new translation. We're in spinlock land at this point, so no + * chance of resolving the translation. + * + * Unmap the vLPI and fall back to software LPI injection. + */ + return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq); +} + +void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_arm_halt_guest(irqfd->kvm); +} + +void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_arm_resume_guest(irqfd->kvm); +} + +/* Initialize Hyp-mode and memory mappings on all CPUs */ +static __init int kvm_arm_init(void) +{ + int err; + bool in_hyp_mode; + + if (!is_hyp_mode_available()) { + kvm_info("HYP mode not available\n"); + return -ENODEV; + } + + if (kvm_get_mode() == KVM_MODE_NONE) { + kvm_info("KVM disabled from command line\n"); + return -ENODEV; + } + + err = kvm_sys_reg_table_init(); + if (err) { + kvm_info("Error initializing system register tables"); + return err; + } + + in_hyp_mode = is_kernel_in_hyp_mode(); + + if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || + cpus_have_final_cap(ARM64_WORKAROUND_1508412)) + kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ + "Only trusted guests should be used on this system.\n"); + + err = kvm_set_ipa_limit(); + if (err) + return err; + + err = kvm_arm_init_sve(); + if (err) + return err; + + err = kvm_arm_vmid_alloc_init(); + if (err) { + kvm_err("Failed to initialize VMID allocator.\n"); + return err; + } + + if (!in_hyp_mode) { + err = init_hyp_mode(); + if (err) + goto out_err; + } + + err = kvm_init_vector_slots(); + if (err) { + kvm_err("Cannot initialise vector slots\n"); + goto out_hyp; + } + + err = init_subsystems(); + if (err) + goto out_hyp; + + kvm_info("%s%sVHE%s mode initialized successfully\n", + in_hyp_mode ? "" : (is_protected_kvm_enabled() ? + "Protected " : "Hyp "), + in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ? + "h" : "n"), + cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": ""); + + /* + * FIXME: Do something reasonable if kvm_init() fails after pKVM + * hypervisor protection is finalized. + */ + err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (err) + goto out_subs; + + /* + * This should be called after initialization is done and failure isn't + * possible anymore. 
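+	 *
+	 * At this point the only remaining work is the kernel-VA to hyp-VA
+	 * fixup of the per-CPU SVE state pointers performed by
+	 * finalize_init_hyp_mode(), which by construction cannot fail.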
+ */ + if (!in_hyp_mode) + finalize_init_hyp_mode(); + + kvm_arm_initialised = true; + + return 0; + +out_subs: + teardown_subsystems(); +out_hyp: + if (!in_hyp_mode) + teardown_hyp_mode(); +out_err: + kvm_arm_vmid_alloc_free(); + return err; +} + +static int __init early_kvm_mode_cfg(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "none") == 0) { + kvm_mode = KVM_MODE_NONE; + return 0; + } + + if (!is_hyp_mode_available()) { + pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n"); + return 0; + } + + if (strcmp(arg, "protected") == 0) { + if (!is_kernel_in_hyp_mode()) + kvm_mode = KVM_MODE_PROTECTED; + else + pr_warn_once("Protected KVM not available with VHE\n"); + + return 0; + } + + if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) { + kvm_mode = KVM_MODE_DEFAULT; + return 0; + } + + if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) { + kvm_mode = KVM_MODE_NV; + return 0; + } + + return -EINVAL; +} +early_param("kvm-arm.mode", early_kvm_mode_cfg); + +static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "trap") == 0) { + *p = KVM_WFX_TRAP; + return 0; + } + + if (strcmp(arg, "notrap") == 0) { + *p = KVM_WFX_NOTRAP; + return 0; + } + + return -EINVAL; +} + +static int __init early_kvm_wfi_trap_policy_cfg(char *arg) +{ + return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy); +} +early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg); + +static int __init early_kvm_wfe_trap_policy_cfg(char *arg) +{ + return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy); +} +early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg); + +enum kvm_mode kvm_get_mode(void) +{ + return kvm_mode; +} + +module_init(kvm_arm_init); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c new file mode 100644 index 000000000000..53bf70126f81 --- /dev/null +++ b/arch/arm64/kvm/at.c @@ -0,0 +1,1795 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Linaro Ltd + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm_host.h> + +#include <asm/esr.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw) +{ + wr->fst = fst; + wr->ptw = s1ptw; + wr->s2 = s1ptw; + wr->failed = true; +} + +#define S1_MMU_DISABLED (-127) + +static int get_ia_size(struct s1_walk_info *wi) +{ + return 64 - wi->txsz; +} + +/* Return true if the IPA is out of the OA range */ +static bool check_output_size(u64 ipa, struct s1_walk_info *wi) +{ + if (wi->pa52bit) + return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits)); + return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); +} + +static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr) +{ + switch (BIT(wi->pgshift)) { + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52)) + return false; + return ((wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : + FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110); + case SZ_16K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)) + return false; + break; + case SZ_4K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)) + return false; + break; + } + + return (tcr & (wi->regime == TR_EL2 ? 
TCR_EL2_DS : TCR_DS)); +} + +static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc) +{ + u64 addr; + + if (!wi->pa52bit) + return desc & GENMASK_ULL(47, wi->pgshift); + + switch (BIT(wi->pgshift)) { + case SZ_4K: + case SZ_16K: + addr = desc & GENMASK_ULL(49, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50; + break; + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + addr = desc & GENMASK_ULL(47, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48; + break; + } + + return addr; +} + +/* Return the translation regime that applies to an AT instruction */ +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) +{ + /* + * We only get here from guest EL2, so the translation + * regime AT applies to is solely defined by {E2H,TGE}. + */ + switch (op) { + case OP_AT_S1E2R: + case OP_AT_S1E2W: + case OP_AT_S1E2A: + return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + default: + return (vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; + } +} + +static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (regime == TR_EL10) { + if (vcpu_has_nv(vcpu) && + !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) + return 0; + + return vcpu_read_sys_reg(vcpu, TCR2_EL1); + } + + return vcpu_read_sys_reg(vcpu, TCR2_EL2); +} + +static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (!kvm_has_s1pie(vcpu->kvm)) + return false; + + /* Abuse TCR2_EL1_PIE and use it for EL2 as well */ + return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE; +} + +static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) +{ + u64 val; + + if (!kvm_has_s1poe(vcpu->kvm)) { + wi->poe = wi->e0poe = false; + return; + } + + val = effective_tcr2(vcpu, wi->regime); + + /* Abuse TCR2_EL1_* for EL2 */ + wi->poe = val & TCR2_EL1_POE; + wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE); +} + +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; + unsigned int stride, x; + bool va55, tbi, lva; + + va55 = va & BIT(55); + + if (vcpu_has_nv(vcpu)) { + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + } else { + WARN_ON_ONCE(wi->regime != TR_EL10); + wi->s2 = false; + hcr = 0; + } + + switch (wi->regime) { + case TR_EL10: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL2: + case TR_EL20: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr = (va55 ? 
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55 && wi->regime != TR_EL2) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + wi->pa52bit = has_52bit_pa(vcpu, wi, tcr); + + ia_bits = get_ia_size(wi); + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + + tbi = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_TBI, tcr) : + (va55 ? + FIELD_GET(TCR_TBI1, tcr) : + FIELD_GET(TCR_TBI0, tcr))); + + if (!tbi && (u64)sign_extend64(va, 55) != va) + goto addrsz; + + wi->sh = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_SH0_MASK, tcr) : + (va55 ? + FIELD_GET(TCR_SH1_MASK, tcr) : + FIELD_GET(TCR_SH0_MASK, tcr))); + + va = (u64)sign_extend64(va, 55); + + /* Let's put the MMU disabled case aside immediately */ + switch (wi->regime) { + case TR_EL10: + /* + * If dealing with the EL1&0 translation regime, 3 things + * can disable the S1 translation: + * + * - HCR_EL2.DC = 1 + * - HCR_EL2.{E2H,TGE} = {0,1} + * - SCTLR_EL1.M = 0 + * + * The TGE part is interesting. If we have decided that this + * is EL1&0, then it means that either {E2H,TGE} == {1,0} or + * {0,x}, and we only need to test for TGE == 1. + */ + if (hcr & (HCR_DC | HCR_TGE)) { + wr->level = S1_MMU_DISABLED; + break; + } + fallthrough; + case TR_EL2: + case TR_EL20: + if (!(sctlr & SCTLR_ELx_M)) + wr->level = S1_MMU_DISABLED; + break; + } + + if (wr->level == S1_MMU_DISABLED) { + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) + goto addrsz; + + wr->pa = va; + return 0; + } + + wi->be = sctlr & SCTLR_ELx_EE; + + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); + wi->hpd &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HPD, tcr) : + (va55 ? + FIELD_GET(TCR_HPD1, tcr) : + FIELD_GET(TCR_HPD0, tcr))); + /* R_JHSVW */ + wi->hpd |= s1pie_enabled(vcpu, wi->regime); + + /* Do we have POE? */ + compute_s1poe(vcpu, wi); + + /* R_BVXDG */ + wi->hpd |= (wi->poe || wi->e0poe); + + /* R_PLCGL, R_YXNYW */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { + if (wi->txsz > 39) + goto transfault; + } else { + if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) + goto transfault; + } + + /* R_GTJBY, R_SXWGM */ + switch (BIT(wi->pgshift)) { + case SZ_4K: + case SZ_16K: + lva = wi->pa52bit; + break; + case SZ_64K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); + break; + } + + if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) + goto transfault; + + /* R_YYVYV, I_THCZK */ + if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || + (va55 && va < GENMASK(63, ia_bits))) + goto transfault; + + /* I_ZFSYQ */ + if (wi->regime != TR_EL2 && + (tcr & (va55 ? 
TCR_EPD1_MASK : TCR_EPD0_MASK))) + goto transfault; + + /* R_BNDVG and following statements */ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && + wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + goto transfault; + + ps = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); + + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit)); + + /* Compute minimal alignment */ + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); + + wi->baddr = ttbr & TTBRx_EL1_BADDR; + if (wi->pa52bit) { + /* + * Force the alignment on 64 bytes for top-level tables + * smaller than 8 entries, since TTBR.BADDR[5:2] are used to + * store bits [51:48] of the first level of lookup. + */ + x = max(x, 6); + + wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48; + } + + /* R_VPBBF */ + if (check_output_size(wi->baddr, wi)) + goto addrsz; + + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + + wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF); + wi->ha &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HA, tcr) : + FIELD_GET(TCR_HA, tcr)); + + return 0; + +addrsz: + /* + * Address Size Fault level 0 to indicate it comes from TTBR. + * yes, this is an oddity. + */ + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false); + return -EFAULT; + +transfault: + /* Translation Fault on start level */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false); + return -EFAULT; +} + +static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc, + struct s1_walk_info *wi) +{ + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; +} + +static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new, + struct s1_walk_info *wi) +{ + if (wi->be) { + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); + } else { + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 va_top, va_bottom, baddr, desc, new_desc, ipa; + int level, stride, ret; + + level = wi->sl; + stride = wi->pgshift - 3; + baddr = wi->baddr; + + va_top = get_ia_size(wi) - 1; + + while (1) { + u64 index; + + va_bottom = (3 - level) * stride + wi->pgshift; + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); + + ipa = baddr | index; + + if (wi->s2) { + struct kvm_s2_trans s2_trans = {}; + + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); + if (ret) { + fail_s1_walk(wr, + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, + true); + return ret; + } + + if (!kvm_s2_trans_readable(&s2_trans)) { + fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), + true); + + return -EPERM; + } + + ipa = kvm_s2_trans_output(&s2_trans); + } + + if (wi->filter) { + ret = wi->filter->fn(&(struct s1_walk_context) + { + .wi = wi, + .table_ipa = baddr, + .level = level, + }, wi->filter->priv); + if (ret) + return ret; + } + + ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi); + if (ret) { + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); + return ret; + } + + new_desc = desc; + + /* Invalid descriptor */ + if (!(desc & BIT(0))) + goto transfault; + + /* Block mapping, check validity down the line */ + if (!(desc & BIT(1))) + break; + + /* Page mapping */ + if (level == 3) + break; + + /* Table handling 
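+		 * (the descriptor is valid and points to a next-level table):
+		 * accumulate the hierarchical APTable/PXNTable/UXNTable
+		 * attributes unless HPD has disabled them, extract the
+		 * next-level table address, check it against the configured
+		 * output size, and descend one level.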
*/ + if (!wi->hpd) { + wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); + } + + baddr = desc_to_oa(wi, desc); + + /* Check for out-of-range OA */ + if (check_output_size(baddr, wi)) + goto addrsz; + + /* Prepare for next round */ + va_top = va_bottom - 1; + level++; + } + + /* Block mapping, check the validity of the level */ + if (!(desc & BIT(1))) { + bool valid_block = false; + + switch (BIT(wi->pgshift)) { + case SZ_4K: + valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0); + break; + case SZ_16K: + case SZ_64K: + valid_block = level == 2 || (wi->pa52bit && level == 1); + break; + } + + if (!valid_block) + goto transfault; + } + + baddr = desc_to_oa(wi, desc); + if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) + goto addrsz; + + if (wi->ha) + new_desc |= PTE_AF; + + if (new_desc != desc) { + ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + + if (!(desc & PTE_AF)) { + fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false); + return -EACCES; + } + + va_bottom += contiguous_bit_shift(desc, wi, level); + + wr->failed = false; + wr->level = level; + wr->desc = desc; + wr->pa = baddr & GENMASK(52, va_bottom); + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + + wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); + if (wr->nG) { + u64 asid_ttbr, tcr; + + switch (wi->regime) { + case TR_EL10: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL20: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + wr->asid &= GENMASK(7, 0); + } + + return 0; + +addrsz: + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false); + return -EINVAL; +transfault: + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false); + return -ENOENT; +} + +struct mmu_config { + u64 ttbr0; + u64 ttbr1; + u64 tcr; + u64 mair; + u64 tcr2; + u64 pir; + u64 pire0; + u64 por_el0; + u64 por_el1; + u64 sctlr; + u64 vttbr; + u64 vtcr; +}; + +static void __mmu_config_save(struct mmu_config *config) +{ + config->ttbr0 = read_sysreg_el1(SYS_TTBR0); + config->ttbr1 = read_sysreg_el1(SYS_TTBR1); + config->tcr = read_sysreg_el1(SYS_TCR); + config->mair = read_sysreg_el1(SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + config->tcr2 = read_sysreg_el1(SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + config->pir = read_sysreg_el1(SYS_PIR); + config->pire0 = read_sysreg_el1(SYS_PIRE0); + } + if (system_supports_poe()) { + config->por_el1 = read_sysreg_el1(SYS_POR); + config->por_el0 = read_sysreg_s(SYS_POR_EL0); + } + } + config->sctlr = read_sysreg_el1(SYS_SCTLR); + config->vttbr = read_sysreg(vttbr_el2); + config->vtcr = read_sysreg(vtcr_el2); +} + +static void __mmu_config_restore(struct mmu_config *config) +{ + /* + * ARM errata 1165522 and 1530923 require TGE to be 1 before + * we update the guest state. 
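+	 *
+	 * The ALTERNATIVE() below expands to an ISB only on cores affected
+	 * by ARM64_WORKAROUND_SPECULATIVE_AT, so that the HCR_EL2 update
+	 * done by the caller (restoring TGE) is synchronised before the EL1
+	 * translation registers are rewritten; it is a NOP everywhere else.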
+ */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + + write_sysreg_el1(config->ttbr0, SYS_TTBR0); + write_sysreg_el1(config->ttbr1, SYS_TTBR1); + write_sysreg_el1(config->tcr, SYS_TCR); + write_sysreg_el1(config->mair, SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + write_sysreg_el1(config->tcr2, SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + write_sysreg_el1(config->pir, SYS_PIR); + write_sysreg_el1(config->pire0, SYS_PIRE0); + } + if (system_supports_poe()) { + write_sysreg_el1(config->por_el1, SYS_POR); + write_sysreg_s(config->por_el0, SYS_POR_EL0); + } + } + write_sysreg_el1(config->sctlr, SYS_SCTLR); + write_sysreg(config->vttbr, vttbr_el2); + write_sysreg(config->vtcr, vtcr_el2); +} + +static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 host_pan; + bool fail; + + host_pan = read_sysreg_s(SYS_PSTATE_PAN); + write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); + + switch (op) { + case OP_AT_S1E1RP: + fail = __kvm_at(OP_AT_S1E1RP, vaddr); + break; + case OP_AT_S1E1WP: + fail = __kvm_at(OP_AT_S1E1WP, vaddr); + break; + } + + write_sysreg_s(host_pan, SYS_PSTATE_PAN); + + return fail; +} + +#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) +#define MEMATTR_NC 0b0100 +#define MEMATTR_Wt 0b1000 +#define MEMATTR_Wb 0b1100 +#define MEMATTR_WbRaWa 0b1111 + +#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) + +static u8 s2_memattr_to_attr(u8 memattr) +{ + memattr &= 0b1111; + + switch (memattr) { + case 0b0000: + case 0b0001: + case 0b0010: + case 0b0011: + return memattr << 2; + case 0b0100: + return MEMATTR(Wb, Wb); + case 0b0101: + return MEMATTR(NC, NC); + case 0b0110: + return MEMATTR(Wt, NC); + case 0b0111: + return MEMATTR(Wb, NC); + case 0b1000: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1001: + return MEMATTR(NC, Wt); + case 0b1010: + return MEMATTR(Wt, Wt); + case 0b1011: + return MEMATTR(Wb, Wt); + case 0b1100: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1101: + return MEMATTR(NC, Wb); + case 0b1110: + return MEMATTR(Wt, Wb); + case 0b1111: + return MEMATTR(Wb, Wb); + default: + unreachable(); + } +} + +static u8 combine_s1_s2_attr(u8 s1, u8 s2) +{ + bool transient; + u8 final = 0; + + /* Upgrade transient s1 to non-transient to simplify things */ + switch (s1) { + case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ + transient = true; + s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); + break; + case 0b0101 ... 
0b0111: /* Normal, Write-Back Transient */ + transient = true; + s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); + break; + default: + transient = false; + } + + /* S2CombineS1AttrHints() */ + if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || + (s2 & GENMASK(3, 2)) == MEMATTR_NC) + final = MEMATTR_NC; + else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || + (s2 & GENMASK(3, 2)) == MEMATTR_Wt) + final = MEMATTR_Wt; + else + final = MEMATTR_Wb; + + if (final != MEMATTR_NC) { + /* Inherit RaWa hints form S1 */ + if (transient) { + switch (s1 & GENMASK(3, 2)) { + case MEMATTR_Wt: + final = 0; + break; + case MEMATTR_Wb: + final = MEMATTR_NC; + break; + } + } + + final |= s1 & GENMASK(1, 0); + } + + return final; +} + +#define ATTR_NSH 0b00 +#define ATTR_RSV 0b01 +#define ATTR_OSH 0b10 +#define ATTR_ISH 0b11 + +static u8 compute_final_sh(u8 attr, u8 sh) +{ + /* Any form of device, as well as NC has SH[1:0]=0b10 */ + if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) + return ATTR_OSH; + + if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ + sh = ATTR_NSH; + + return sh; +} + +static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr, + u8 attr) +{ + u8 sh; + + /* + * non-52bit and LPA have their basic shareability described in the + * descriptor. LPA2 gets it from the corresponding field in TCR, + * conveniently recorded in the walk info. + */ + if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K) + sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc); + else + sh = wi->sh; + + return compute_final_sh(attr, sh); +} + +static u8 combine_sh(u8 s1_sh, u8 s2_sh) +{ + if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) + return ATTR_OSH; + if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) + return ATTR_ISH; + + return ATTR_NSH; +} + +static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, + struct kvm_s2_trans *tr) +{ + u8 s1_parattr, s2_memattr, final_attr, s2_sh; + u64 par; + + /* If S2 has failed to translate, report the damage */ + if (tr->esr) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= SYS_PAR_EL1_S; + par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); + return par; + } + + s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); + s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { + if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) + s2_memattr &= ~BIT(3); + + /* Combination of R_VRJSW and R_RHWZM */ + switch (s2_memattr) { + case 0b0101: + if (MEMATTR_IS_DEVICE(s1_parattr)) + final_attr = s1_parattr; + else + final_attr = MEMATTR(NC, NC); + break; + case 0b0110: + case 0b1110: + final_attr = MEMATTR(WbRaWa, WbRaWa); + break; + case 0b0111: + case 0b1111: + /* Preserve S1 attribute */ + final_attr = s1_parattr; + break; + case 0b0100: + case 0b1100: + case 0b1101: + /* Reserved, do something non-silly */ + final_attr = s1_parattr; + break; + default: + /* + * MemAttr[2]=0, Device from S2. + * + * FWB does not influence the way that stage 1 + * memory types and attributes are combined + * with stage 2 Device type and attributes. 
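+			 *
+			 * Rough worked example: Device encodings have the
+			 * upper attribute nibble clear, so they compare below
+			 * any Normal encoding and min() picks the Device one:
+			 *
+			 *	s2_memattr_to_attr(0b0010) == 0b00001000 (nGRE)
+			 *	min(0b00001000, 0xff) == 0b00001000 (S1 was
+			 *	Normal WbRaWa)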
+ */ + final_attr = min(s2_memattr_to_attr(s2_memattr), + s1_parattr); + } + } else { + /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ + u8 s2_parattr = s2_memattr_to_attr(s2_memattr); + + if (MEMATTR_IS_DEVICE(s1_parattr) || + MEMATTR_IS_DEVICE(s2_parattr)) { + final_attr = min(s1_parattr, s2_parattr); + } else { + /* At this stage, this is memory vs memory */ + final_attr = combine_s1_s2_attr(s1_parattr & 0xf, + s2_parattr & 0xf); + final_attr |= combine_s1_s2_attr(s1_parattr >> 4, + s2_parattr >> 4) << 4; + } + } + + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && + !MEMATTR_IS_DEVICE(final_attr)) + final_attr = MEMATTR(NC, NC); + + s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc); + + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); + par |= tr->output & GENMASK(47, 12); + par |= FIELD_PREP(SYS_PAR_EL1_SH, + combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), + compute_final_sh(final_attr, s2_sh))); + + return par; +} + +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u64 par; + + if (wr->failed) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); + par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; + par |= wr->s2 ? SYS_PAR_EL1_S : 0; + } else if (wr->level == S1_MMU_DISABLED) { + /* MMU off or HCR_EL2.DC == 1 */ + par = SYS_PAR_EL1_NSE; + par |= wr->pa & SYS_PAR_EL1_PA; + + if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) && + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, + MEMATTR(WbRaWa, WbRaWa)); + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); + } else { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); + } + } else { + u64 mair, sctlr; + u8 sh; + + par = SYS_PAR_EL1_NSE; + + mair = (wi->regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, MAIR_EL1) : + vcpu_read_sys_reg(vcpu, MAIR_EL2)); + + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; + mair &= 0xff; + + sctlr = (wi->regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, SCTLR_EL1) : + vcpu_read_sys_reg(vcpu, SCTLR_EL2)); + + /* Force NC for memory if SCTLR_ELx.C is clear */ + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) + mair = MEMATTR(NC, NC); + + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); + par |= wr->pa & SYS_PAR_EL1_PA; + + sh = compute_s1_sh(wi, wr, mair); + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); + } + + return par; +} + +static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + u64 sctlr; + + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + return false; + + if (s1pie_enabled(vcpu, regime)) + return true; + + if (regime == TR_EL10) + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + else + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + + return sctlr & SCTLR_EL1_EPAN; +} + +static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + bool wxn; + + /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) { + case 0b00: + wr->pr = wr->pw = true; + wr->ur = wr->uw = false; + break; + case 0b01: + wr->pr = wr->pw = wr->ur = wr->uw = true; + break; + case 0b10: + wr->pr = true; + wr->pw = wr->ur = wr->uw = false; + break; + case 0b11: + wr->pr = wr->ur = true; + wr->pw = wr->uw = false; + break; + } + + /* We don't use px for anything yet, but hey... 
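+		 * Note that privileged execute is additionally denied when
+		 * the page is user-writable (the "|| wr->uw" term below),
+		 * matching the AArch64.S1DirectBasePermissions() rule that
+		 * EL0-writable memory is never executable at EL1.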
*/ + wr->px = !((wr->desc & PTE_PXN) || wr->uw); + wr->ux = !(wr->desc & PTE_UXN); + } else { + wr->ur = wr->uw = wr->ux = false; + + if (!(wr->desc & PTE_RDONLY)) { + wr->pr = wr->pw = true; + } else { + wr->pr = true; + wr->pw = false; + } + + /* XN maps to UXN */ + wr->px = !(wr->desc & PTE_UXN); + } + + switch (wi->regime) { + case TR_EL2: + case TR_EL20: + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); + break; + case TR_EL10: + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); + break; + } + + wr->pwxn = wr->uwxn = wxn; + wr->pov = wi->poe; + wr->uov = wi->e0poe; +} + +static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + /* Hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (wr->APTable) { + case 0b00: + break; + case 0b01: + wr->ur = wr->uw = false; + break; + case 0b10: + wr->pw = wr->uw = false; + break; + case 0b11: + wr->pw = wr->ur = wr->uw = false; + break; + } + + wr->px &= !wr->PXNTable; + wr->ux &= !wr->UXNTable; + } else { + if (wr->APTable & BIT(1)) + wr->pw = false; + + /* XN maps to UXN */ + wr->px &= !wr->UXNTable; + } +} + +#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf) + +#define set_priv_perms(wr, r, w, x) \ + do { \ + (wr)->pr = (r); \ + (wr)->pw = (w); \ + (wr)->px = (x); \ + } while (0) + +#define set_unpriv_perms(wr, r, w, x) \ + do { \ + (wr)->ur = (r); \ + (wr)->uw = (w); \ + (wr)->ux = (x); \ + } while (0) + +#define set_priv_wxn(wr, v) \ + do { \ + (wr)->pwxn = (v); \ + } while (0) + +#define set_unpriv_wxn(wr, v) \ + do { \ + (wr)->uwxn = (v); \ + } while (0) + +/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */ +#define set_perms(w, wr, ip) \ + do { \ + /* R_LLZDZ */ \ + switch ((ip)) { \ + case 0b0000: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b0010: \ + set_ ## w ## _perms((wr), false, false, true ); \ + break; \ + case 0b0011: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b0100: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0101: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b0110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b0111: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1000: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1010: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b1011: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1100: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b1101: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1111: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + } \ + \ + /* R_HJYGR */ \ + set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \ + \ + } while (0) + +static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 up, pp, idx; + + idx = pte_pi_index(wr->desc); + + switch (wi->regime) { + case TR_EL10: + pp = perm_idx(vcpu, PIR_EL1, idx); + up = perm_idx(vcpu, PIRE0_EL1, idx); + break; + case TR_EL20: + pp = 
perm_idx(vcpu, PIR_EL2, idx); + up = perm_idx(vcpu, PIRE0_EL2, idx); + break; + case TR_EL2: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = 0; + break; + } + + set_perms(priv, wr, pp); + + if (wi->regime != TR_EL2) + set_perms(unpriv, wr, up); + else + set_unpriv_perms(wr, false, false, false); + + wr->pov = wi->poe && !(pp & BIT(3)); + wr->uov = wi->e0poe && !(up & BIT(3)); + + /* R_VFPJF */ + if (wr->px && wr->uw) { + set_priv_perms(wr, false, false, false); + set_unpriv_perms(wr, false, false, false); + } +} + +static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 idx, pov_perms, uov_perms; + + idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc); + + if (wr->pov) { + switch (wi->regime) { + case TR_EL10: + pov_perms = perm_idx(vcpu, POR_EL1, idx); + break; + case TR_EL20: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + break; + case TR_EL2: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + break; + } + + if (pov_perms & ~POE_RWX) + pov_perms = POE_NONE; + + /* R_QXXPC, S1PrivOverflow enabled */ + if (wr->pwxn && (pov_perms & POE_X)) + pov_perms &= ~POE_W; + + wr->pr &= pov_perms & POE_R; + wr->pw &= pov_perms & POE_W; + wr->px &= pov_perms & POE_X; + } + + if (wr->uov) { + switch (wi->regime) { + case TR_EL10: + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL20: + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL2: + uov_perms = 0; + break; + } + + if (uov_perms & ~POE_RWX) + uov_perms = POE_NONE; + + /* R_NPBXC, S1UnprivOverlay enabled */ + if (wr->uwxn && (uov_perms & POE_X)) + uov_perms &= ~POE_W; + + wr->ur &= uov_perms & POE_R; + wr->uw &= uov_perms & POE_W; + wr->ux &= uov_perms & POE_X; + } +} + +static void compute_s1_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + bool pan; + + if (!s1pie_enabled(vcpu, wi->regime)) + compute_s1_direct_permissions(vcpu, wi, wr); + else + compute_s1_indirect_permissions(vcpu, wi, wr); + + if (!wi->hpd) + compute_s1_hierarchical_permissions(vcpu, wi, wr); + + compute_s1_overlay_permissions(vcpu, wi, wr); + + /* R_QXXPC, S1PrivOverlay disabled */ + if (!wr->pov) + wr->px &= !(wr->pwxn && wr->pw); + + /* R_NPBXC, S1UnprivOverlay disabled */ + if (!wr->uov) + wr->ux &= !(wr->uwxn && wr->uw); + + pan = wi->pan && (wr->ur || wr->uw || + (pan3_enabled(vcpu, wi->regime) && wr->ux)); + wr->pw &= !pan; + wr->pr &= !pan; +} + +static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par) +{ + struct s1_walk_result wr = {}; + struct s1_walk_info wi = {}; + bool perm_fail = false; + int ret, idx; + + wi.regime = compute_translation_regime(vcpu, op); + wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); + + ret = setup_s1_walk(vcpu, &wi, &wr, vaddr); + if (ret) + goto compute_par; + + if (wr.level == S1_MMU_DISABLED) + goto compute_par; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = walk_s1(vcpu, &wi, &wr, vaddr); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + /* + * Race to update a descriptor -- restart the walk. 
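+	 *
+	 * -EAGAIN here typically means that the compare-and-swap updating
+	 * the Access flag lost against a concurrent writer (see
+	 * __kvm_at_swap_desc()), so propagate it and let the walk be
+	 * restarted rather than reporting a spurious fault.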
+ */ + if (ret == -EAGAIN) + return ret; + if (ret) + goto compute_par; + + compute_s1_permissions(vcpu, &wi, &wr); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1R: + case OP_AT_S1E2R: + perm_fail = !wr.pr; + break; + case OP_AT_S1E1WP: + case OP_AT_S1E1W: + case OP_AT_S1E2W: + perm_fail = !wr.pw; + break; + case OP_AT_S1E0R: + perm_fail = !wr.ur; + break; + case OP_AT_S1E0W: + perm_fail = !wr.uw; + break; + case OP_AT_S1E1A: + case OP_AT_S1E2A: + break; + default: + BUG(); + } + + if (perm_fail) + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); + +compute_par: + *par = compute_par_s1(vcpu, &wi, &wr); + return 0; +} + +/* + * Return the PAR_EL1 value as the result of a valid translation. + * + * If the translation is unsuccessful, the value may only contain + * PAR_EL1.F, and cannot be taken at face value. It isn't an + * indication of the translation having failed, only that the fast + * path did not succeed, *unless* it indicates a S1 permission or + * access fault. + */ +static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct mmu_config config; + struct kvm_s2_mmu *mmu; + bool fail, mmu_cs; + u64 par; + + par = SYS_PAR_EL1_F; + + /* + * We've trapped, so everything is live on the CPU. As we will + * be switching contexts behind everybody's back, disable + * interrupts while holding the mmu lock. + */ + guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); + + /* + * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already + * the right one (as we trapped from vEL2). If not, save the + * full MMU context. + * + * We are also guaranteed to be in the correct context if + * we're not in a nested VM. + */ + mmu_cs = (vcpu_has_nv(vcpu) && + !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))); + if (!mmu_cs) + goto skip_mmu_switch; + + /* + * Obtaining the S2 MMU for a L2 is horribly racy, and we may not + * find it (recycled by another vcpu, for example). When this + * happens, admit defeat immediately and use the SW (slow) path. 
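+	 *
+	 * "Admitting defeat" simply means returning with only PAR_EL1.F
+	 * set: the caller treats an F bit that isn't backed by a permission
+	 * or access fault as "fast path failed" and falls back to
+	 * handle_at_slow().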
+ */ + mmu = lookup_s2_mmu(vcpu); + if (!mmu) + return par; + + __mmu_config_save(&config); + + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + if (kvm_has_tcr2(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); + if (kvm_has_s1pie(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); + } + if (kvm_has_s1poe(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR); + write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0); + } + } + write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); + __load_stage2(mmu, mmu->arch); + +skip_mmu_switch: + /* Temporarily switch back to guest context */ + write_sysreg_hcr(vcpu->arch.hcr_el2); + isb(); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1WP: + fail = at_s1e1p_fast(vcpu, op, vaddr); + break; + case OP_AT_S1E1R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E1W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E0R: + fail = __kvm_at(OP_AT_S1E0R, vaddr); + break; + case OP_AT_S1E0W: + fail = __kvm_at(OP_AT_S1E0W, vaddr); + break; + case OP_AT_S1E1A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + break; + } + + if (!fail) + par = read_sysreg_par(); + + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); + + if (mmu_cs) + __mmu_config_restore(&config); + + return par; +} + +static bool par_check_s1_perm_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && + !(par & SYS_PAR_EL1_S)); +} + +static bool par_check_s1_access_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS && + !(par & SYS_PAR_EL1_S)); +} + +int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + int ret; + + /* + * If PAR_EL1 reports that AT failed on a S1 permission or access + * fault, we know for sure that the PTW was able to walk the S1 + * tables and there's nothing else to do. + * + * If AT failed for any other reason, then we must walk the guest S1 + * to emulate the instruction. + */ + if ((par & SYS_PAR_EL1_F) && + !par_check_s1_perm_fault(par) && + !par_check_s1_access_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; +} + +int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par; + int ret; + + /* + * We've trapped, so everything is live on the CPU. As we will be + * switching context behind everybody's back, disable interrupts... 
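+	 * The guarded scope below temporarily rewrites HCR_EL2 (clearing
+	 * TGE, setting VM, plus NV/NV1 for the non-E2H case) before
+	 * replaying the AT instruction, and restores it before interrupts
+	 * are re-enabled.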
+ */ + scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { + u64 val, hcr; + bool fail; + + val = hcr = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + val |= HCR_VM; + + if (!vcpu_el2_e2h_is_set(vcpu)) + val |= HCR_NV | HCR_NV1; + + write_sysreg_hcr(val); + isb(); + + par = SYS_PAR_EL1_F; + + switch (op) { + case OP_AT_S1E2R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E2W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E2A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + } + + isb(); + + if (!fail) + par = read_sysreg_par(); + + write_sysreg_hcr(hcr); + isb(); + } + + /* We failed the translation, let's replay it in slow motion */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; +} + +int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct kvm_s2_trans out = {}; + u64 ipa, par; + bool write; + int ret; + + /* Do the stage-1 translation */ + switch (op) { + case OP_AT_S12E1R: + op = OP_AT_S1E1R; + write = false; + break; + case OP_AT_S12E1W: + op = OP_AT_S1E1W; + write = true; + break; + case OP_AT_S12E0R: + op = OP_AT_S1E0R; + write = false; + break; + case OP_AT_S12E0W: + op = OP_AT_S1E0W; + write = true; + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + __kvm_at_s1e01(vcpu, op, vaddr); + par = vcpu_read_sys_reg(vcpu, PAR_EL1); + if (par & SYS_PAR_EL1_F) + return 0; + + /* + * If we only have a single stage of translation (EL2&0), exit + * early. Same thing if {VM,DC}=={0,0}. + */ + if (compute_translation_regime(vcpu, op) == TR_EL20 || + !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) + return 0; + + /* Do the stage-2 translation */ + ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); + out.esr = 0; + ret = kvm_walk_nested_s2(vcpu, ipa, &out); + if (ret < 0) + return ret; + + /* Check the access permission */ + if (!out.esr && + ((!write && !out.readable) || (write && !out.writable))) + out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); + + par = compute_par_s12(vcpu, par, &out); + vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; +} + +/* + * Translate a VA for a given EL in a given translation regime, with + * or without PAN. This requires wi->{regime, as_el0, pan} to be + * set. The rest of the wi and wr should be 0-initialised. + */ +int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + int ret; + + ret = setup_s1_walk(vcpu, wi, wr, va); + if (ret) + return ret; + + if (wr->level == S1_MMU_DISABLED) { + wr->ur = wr->uw = wr->ux = true; + wr->pr = wr->pw = wr->px = true; + } else { + ret = walk_s1(vcpu, wi, wr, va); + if (ret) + return ret; + + compute_s1_permissions(vcpu, wi, wr); + } + + return 0; +} + +struct desc_match { + u64 ipa; + int level; +}; + +static int match_s1_desc(struct s1_walk_context *ctxt, void *priv) +{ + struct desc_match *dm = priv; + u64 ipa = dm->ipa; + + /* Use S1 granule alignment */ + ipa &= GENMASK(51, ctxt->wi->pgshift); + + /* Not the IPA we're looking for? Continue. 
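+	 * Returning 0 keeps the walker going; any non-zero value stops it,
+	 * and -EINTR is used here to report a successful match back to
+	 * __kvm_find_s1_desc_level().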
*/ + if (ipa != ctxt->table_ipa) + return 0; + + /* Note the level and interrupt the walk */ + dm->level = ctxt->level; + return -EINTR; +} + +int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) +{ + struct desc_match dm = { + .ipa = ipa, + }; + struct s1_walk_info wi = { + .filter = &(struct s1_walk_filter){ + .fn = match_s1_desc, + .priv = &dm, + }, + .as_el0 = false, + .pan = false, + }; + struct s1_walk_result wr = {}; + int ret; + + if (is_hyp_ctxt(vcpu)) + wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + else + wi.regime = TR_EL10; + + ret = setup_s1_walk(vcpu, &wi, &wr, va); + if (ret) + return ret; + + /* We really expect the S1 MMU to be on here... */ + if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) { + *level = 0; + return 0; + } + + /* Walk the guest's PT, looking for a match along the way */ + ret = walk_s1(vcpu, &wi, &wr, va); + switch (ret) { + case -EINTR: + /* We interrupted the walk on a match, return the level */ + *level = dm.level; + return 0; + case 0: + /* The walk completed, we failed to find the entry */ + return -ENOENT; + default: + /* Any other error... */ + return ret; + } +} + +#ifdef CONFIG_ARM64_LSE_ATOMICS +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + u64 tmp = old; + int ret = 0; + + uaccess_enable_privileged(); + + asm volatile(__LSE_PREAMBLE + "1: cas %[old], %[new], %[addr]\n" + "2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) + : [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + if (ret) + return ret; + if (tmp != old) + return -EAGAIN; + + return ret; +} +#else +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + return -EINVAL; +} +#endif + +static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + int ret = 1; + u64 tmp; + + uaccess_enable_privileged(); + + asm volatile("prfm pstl1strm, %[addr]\n" + "1: ldxr %[tmp], %[addr]\n" + "sub %[tmp], %[tmp], %[old]\n" + "cbnz %[tmp], 3f\n" + "2: stlxr %w[ret], %[new], %[addr]\n" + "3:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) + : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp) + : [old] "r" (old), [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + /* STLXR didn't update the descriptor, or the compare failed */ + if (ret == 1) + return -EAGAIN; + + return ret; +} + +int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + u64 __user *ptep; + bool writable; + int offset; + gfn_t gfn; + int r; + + lockdep_assert(srcu_read_lock_held(&kvm->srcu)); + + gfn = ipa >> PAGE_SHIFT; + offset = offset_in_page(ipa); + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return -EINVAL; + if (!writable) + return -EPERM; + + ptep = (u64 __user *)hva + offset; + if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) + r = __lse_swap_desc(ptep, old, new); + else + r = __llsc_swap_desc(ptep, old, new); + + if (r < 0) + return r; + + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c new file mode 100644 index 000000000000..24bb3f36e9d5 --- /dev/null +++ b/arch/arm64/kvm/config.c @@ -0,0 +1,1520 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include 
<asm/kvm_nested.h> +#include <asm/sysreg.h> + +/* + * Describes the dependencies between a set of bits (or the negation + * of a set of RES0 bits) and a feature. The flags indicate how the + * data is interpreted. + */ +struct reg_bits_to_feat_map { + union { + u64 bits; + u64 *res0p; + }; + +#define NEVER_FGU BIT(0) /* Can trap, but never UNDEF */ +#define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */ +#define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */ +#define RES0_POINTER BIT(3) /* Pointer to RES0 value instead of bits */ + + unsigned long flags; + + union { + struct { + u8 regidx; + u8 shift; + u8 width; + bool sign; + s8 lo_lim; + }; + bool (*match)(struct kvm *); + bool (*fval)(struct kvm *, u64 *); + }; +}; + +/* + * Describes the dependencies for a given register: + * + * @feat_map describes the dependency for the whole register. If the + * features the register depends on are not present, the whole + * register is effectively RES0. + * + * @bit_feat_map describes the dependencies for a set of bits in that + * register. If the features these bits depend on are not present, the + * bits are effectively RES0. + */ +struct reg_feat_map_desc { + const char *name; + const struct reg_bits_to_feat_map feat_map; + const struct reg_bits_to_feat_map *bit_feat_map; + const unsigned int bit_feat_map_sz; +}; + +#define __NEEDS_FEAT_3(m, f, w, id, fld, lim) \ + { \ + .w = (m), \ + .flags = (f), \ + .regidx = IDREG_IDX(SYS_ ## id), \ + .shift = id ##_## fld ## _SHIFT, \ + .width = id ##_## fld ## _WIDTH, \ + .sign = id ##_## fld ## _SIGNED, \ + .lo_lim = id ##_## fld ##_## lim \ + } + +#define __NEEDS_FEAT_2(m, f, w, fun, dummy) \ + { \ + .w = (m), \ + .flags = (f) | CALL_FUNC, \ + .fval = (fun), \ + } + +#define __NEEDS_FEAT_1(m, f, w, fun) \ + { \ + .w = (m), \ + .flags = (f) | CALL_FUNC, \ + .match = (fun), \ + } + +#define __NEEDS_FEAT_FLAG(m, f, w, ...) \ + CONCATENATE(__NEEDS_FEAT_, COUNT_ARGS(__VA_ARGS__))(m, f, w, __VA_ARGS__) + +#define NEEDS_FEAT_FLAG(m, f, ...) \ + __NEEDS_FEAT_FLAG(m, f, bits, __VA_ARGS__) + +#define NEEDS_FEAT_FIXED(m, ...) \ + __NEEDS_FEAT_FLAG(m, FIXED_VALUE, bits, __VA_ARGS__, 0) + +#define NEEDS_FEAT_RES0(p, ...) \ + __NEEDS_FEAT_FLAG(p, RES0_POINTER, res0p, __VA_ARGS__) + +/* + * Declare the dependency between a set of bits and a set of features, + * generating a struct reg_bit_to_feat_map. + */ +#define NEEDS_FEAT(m, ...) NEEDS_FEAT_FLAG(m, 0, __VA_ARGS__) + +/* + * Declare the dependency between a non-FGT register, a set of + * feature, and the set of individual bits it contains. This generates + * a struct reg_feat_map_desc. + */ +#define DECLARE_FEAT_MAP(n, r, m, f) \ + struct reg_feat_map_desc n = { \ + .name = #r, \ + .feat_map = NEEDS_FEAT(~r##_RES0, f), \ + .bit_feat_map = m, \ + .bit_feat_map_sz = ARRAY_SIZE(m), \ + } + +/* + * Specialised version of the above for FGT registers that have their + * RES0 masks described as struct fgt_masks. 
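+ *
+ * As a purely illustrative sketch (the descriptor name and the feature
+ * are picked for the example, not quoted from the real declarations), a
+ * use of this helper could look like:
+ *
+ *	DECLARE_FEAT_MAP_FGT(hfgrtr_feat_map_desc, hfgrtr_masks,
+ *			     hfgrtr_feat_map, FEAT_FGT);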
+ */ +#define DECLARE_FEAT_MAP_FGT(n, msk, m, f) \ + struct reg_feat_map_desc n = { \ + .name = #msk, \ + .feat_map = NEEDS_FEAT_RES0(&msk.res0, f),\ + .bit_feat_map = m, \ + .bit_feat_map_sz = ARRAY_SIZE(m), \ + } + +#define FEAT_SPE ID_AA64DFR0_EL1, PMSVer, IMP +#define FEAT_SPE_FnE ID_AA64DFR0_EL1, PMSVer, V1P2 +#define FEAT_BRBE ID_AA64DFR0_EL1, BRBE, IMP +#define FEAT_TRC_SR ID_AA64DFR0_EL1, TraceVer, IMP +#define FEAT_PMUv3 ID_AA64DFR0_EL1, PMUVer, IMP +#define FEAT_TRBE ID_AA64DFR0_EL1, TraceBuffer, IMP +#define FEAT_TRBEv1p1 ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1 +#define FEAT_DoubleLock ID_AA64DFR0_EL1, DoubleLock, IMP +#define FEAT_TRF ID_AA64DFR0_EL1, TraceFilt, IMP +#define FEAT_AA32EL0 ID_AA64PFR0_EL1, EL0, AARCH32 +#define FEAT_AA32EL1 ID_AA64PFR0_EL1, EL1, AARCH32 +#define FEAT_AA64EL1 ID_AA64PFR0_EL1, EL1, IMP +#define FEAT_AA64EL2 ID_AA64PFR0_EL1, EL2, IMP +#define FEAT_AA64EL3 ID_AA64PFR0_EL1, EL3, IMP +#define FEAT_AIE ID_AA64MMFR3_EL1, AIE, IMP +#define FEAT_S2POE ID_AA64MMFR3_EL1, S2POE, IMP +#define FEAT_S1POE ID_AA64MMFR3_EL1, S1POE, IMP +#define FEAT_S1PIE ID_AA64MMFR3_EL1, S1PIE, IMP +#define FEAT_THE ID_AA64PFR1_EL1, THE, IMP +#define FEAT_SME ID_AA64PFR1_EL1, SME, IMP +#define FEAT_GCS ID_AA64PFR1_EL1, GCS, IMP +#define FEAT_LS64 ID_AA64ISAR1_EL1, LS64, LS64 +#define FEAT_LS64_V ID_AA64ISAR1_EL1, LS64, LS64_V +#define FEAT_LS64_ACCDATA ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA +#define FEAT_RAS ID_AA64PFR0_EL1, RAS, IMP +#define FEAT_RASv2 ID_AA64PFR0_EL1, RAS, V2 +#define FEAT_GICv3 ID_AA64PFR0_EL1, GIC, IMP +#define FEAT_LOR ID_AA64MMFR1_EL1, LO, IMP +#define FEAT_SPEv1p2 ID_AA64DFR0_EL1, PMSVer, V1P2 +#define FEAT_SPEv1p4 ID_AA64DFR0_EL1, PMSVer, V1P4 +#define FEAT_SPEv1p5 ID_AA64DFR0_EL1, PMSVer, V1P5 +#define FEAT_ATS1A ID_AA64ISAR2_EL1, ATS1A, IMP +#define FEAT_SPECRES2 ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX +#define FEAT_SPECRES ID_AA64ISAR1_EL1, SPECRES, IMP +#define FEAT_TLBIRANGE ID_AA64ISAR0_EL1, TLB, RANGE +#define FEAT_TLBIOS ID_AA64ISAR0_EL1, TLB, OS +#define FEAT_PAN2 ID_AA64MMFR1_EL1, PAN, PAN2 +#define FEAT_DPB2 ID_AA64ISAR1_EL1, DPB, DPB2 +#define FEAT_AMUv1 ID_AA64PFR0_EL1, AMU, IMP +#define FEAT_AMUv1p1 ID_AA64PFR0_EL1, AMU, V1P1 +#define FEAT_CMOW ID_AA64MMFR1_EL1, CMOW, IMP +#define FEAT_D128 ID_AA64MMFR3_EL1, D128, IMP +#define FEAT_DoubleFault2 ID_AA64PFR1_EL1, DF2, IMP +#define FEAT_FPMR ID_AA64PFR2_EL1, FPMR, IMP +#define FEAT_MOPS ID_AA64ISAR2_EL1, MOPS, IMP +#define FEAT_NMI ID_AA64PFR1_EL1, NMI, IMP +#define FEAT_SCTLR2 ID_AA64MMFR3_EL1, SCTLRX, IMP +#define FEAT_SYSREG128 ID_AA64ISAR2_EL1, SYSREG_128, IMP +#define FEAT_TCR2 ID_AA64MMFR3_EL1, TCRX, IMP +#define FEAT_XS ID_AA64ISAR1_EL1, XS, IMP +#define FEAT_EVT ID_AA64MMFR2_EL1, EVT, IMP +#define FEAT_EVT_TTLBxS ID_AA64MMFR2_EL1, EVT, TTLBxS +#define FEAT_MTE2 ID_AA64PFR1_EL1, MTE, MTE2 +#define FEAT_RME ID_AA64PFR0_EL1, RME, IMP +#define FEAT_MPAM ID_AA64PFR0_EL1, MPAM, 1 +#define FEAT_S2FWB ID_AA64MMFR2_EL1, FWB, IMP +#define FEAT_TME ID_AA64ISAR0_EL1, TME, IMP +#define FEAT_TWED ID_AA64MMFR1_EL1, TWED, IMP +#define FEAT_E2H0 ID_AA64MMFR4_EL1, E2H0, IMP +#define FEAT_SRMASK ID_AA64MMFR4_EL1, SRMASK, IMP +#define FEAT_PoPS ID_AA64MMFR4_EL1, PoPS, IMP +#define FEAT_PFAR ID_AA64PFR1_EL1, PFAR, IMP +#define FEAT_Debugv8p9 ID_AA64DFR0_EL1, PMUVer, V3P9 +#define FEAT_PMUv3_SS ID_AA64DFR0_EL1, PMSS, IMP +#define FEAT_SEBEP ID_AA64DFR0_EL1, SEBEP, IMP +#define FEAT_EBEP ID_AA64DFR1_EL1, EBEP, IMP +#define FEAT_ITE ID_AA64DFR1_EL1, ITE, IMP +#define FEAT_PMUv3_ICNTR ID_AA64DFR1_EL1, PMICNTR, IMP 
+#define FEAT_SPMU ID_AA64DFR1_EL1, SPMU, IMP +#define FEAT_SPE_nVM ID_AA64DFR2_EL1, SPE_nVM, IMP +#define FEAT_STEP2 ID_AA64DFR2_EL1, STEP, IMP +#define FEAT_CPA2 ID_AA64ISAR3_EL1, CPA, CPA2 +#define FEAT_ASID2 ID_AA64MMFR4_EL1, ASID2, IMP +#define FEAT_MEC ID_AA64MMFR3_EL1, MEC, IMP +#define FEAT_HAFT ID_AA64MMFR1_EL1, HAFDBS, HAFT +#define FEAT_BTI ID_AA64PFR1_EL1, BT, IMP +#define FEAT_ExS ID_AA64MMFR0_EL1, EXS, IMP +#define FEAT_IESB ID_AA64MMFR2_EL1, IESB, IMP +#define FEAT_LSE2 ID_AA64MMFR2_EL1, AT, IMP +#define FEAT_LSMAOC ID_AA64MMFR2_EL1, LSM, IMP +#define FEAT_MixedEnd ID_AA64MMFR0_EL1, BIGEND, IMP +#define FEAT_MixedEndEL0 ID_AA64MMFR0_EL1, BIGENDEL0, IMP +#define FEAT_MTE_ASYNC ID_AA64PFR1_EL1, MTE_frac, ASYNC +#define FEAT_MTE_STORE_ONLY ID_AA64PFR2_EL1, MTESTOREONLY, IMP +#define FEAT_PAN ID_AA64MMFR1_EL1, PAN, IMP +#define FEAT_PAN3 ID_AA64MMFR1_EL1, PAN, PAN3 +#define FEAT_SSBS ID_AA64PFR1_EL1, SSBS, IMP +#define FEAT_TIDCP1 ID_AA64MMFR1_EL1, TIDCP1, IMP +#define FEAT_FGT ID_AA64MMFR0_EL1, FGT, IMP +#define FEAT_FGT2 ID_AA64MMFR0_EL1, FGT, FGT2 +#define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP +#define FEAT_HCX ID_AA64MMFR1_EL1, HCX, IMP + +static bool not_feat_aa64el3(struct kvm *kvm) +{ + return !kvm_has_feat(kvm, FEAT_AA64EL3); +} + +static bool feat_nv2(struct kvm *kvm) +{ + return ((kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) && + kvm_has_feat_enum(kvm, ID_AA64MMFR2_EL1, NV, NI)) || + kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2)); +} + +static bool feat_nv2_e2h0_ni(struct kvm *kvm) +{ + return feat_nv2(kvm) && !kvm_has_feat(kvm, FEAT_E2H0); +} + +static bool feat_rasv1p1(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) || + (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) && + kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1))); +} + +static bool feat_csv2_2_csv2_1p2(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) || + (kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2) && + kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, CSV2, IMP))); +} + +static bool feat_pauth(struct kvm *kvm) +{ + return kvm_has_pauth(kvm, PAuth); +} + +static bool feat_pauth_lr(struct kvm *kvm) +{ + return kvm_has_pauth(kvm, PAuth_LR); +} + +static bool feat_aderr(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ADERR, FEAT_ADERR) && + kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SDERR, FEAT_ADERR)); +} + +static bool feat_anerr(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ANERR, FEAT_ANERR) && + kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SNERR, FEAT_ANERR)); +} + +static bool feat_sme_smps(struct kvm *kvm) +{ + /* + * Revists this if KVM ever supports SME -- this really should + * look at the guest's view of SMIDR_EL1. Funnily enough, this + * is not captured in the JSON file, but only as a note in the + * ARM ARM. + */ + return (kvm_has_feat(kvm, FEAT_SME) && + (read_sysreg_s(SYS_SMIDR_EL1) & SMIDR_EL1_SMPS)); +} + +static bool feat_spe_fds(struct kvm *kvm) +{ + /* + * Revists this if KVM ever supports SPE -- this really should + * look at the guest's view of PMSIDR_EL1. + */ + return (kvm_has_feat(kvm, FEAT_SPEv1p4) && + (read_sysreg_s(SYS_PMSIDR_EL1) & PMSIDR_EL1_FDS)); +} + +static bool feat_trbe_mpam(struct kvm *kvm) +{ + /* + * Revists this if KVM ever supports both MPAM and TRBE -- + * this really should look at the guest's view of TRBIDR_EL1. 
+ */ + return (kvm_has_feat(kvm, FEAT_TRBE) && + kvm_has_feat(kvm, FEAT_MPAM) && + (read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_MPAM)); +} + +static bool feat_asid2_e2h1(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_ASID2) && !kvm_has_feat(kvm, FEAT_E2H0); +} + +static bool feat_d128_e2h1(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_D128) && !kvm_has_feat(kvm, FEAT_E2H0); +} + +static bool feat_mec_e2h1(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_MEC) && !kvm_has_feat(kvm, FEAT_E2H0); +} + +static bool feat_ebep_pmuv3_ss(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_EBEP) || kvm_has_feat(kvm, FEAT_PMUv3_SS); +} + +static bool feat_mixedendel0(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_MixedEnd) || kvm_has_feat(kvm, FEAT_MixedEndEL0); +} + +static bool feat_mte_async(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_MTE2) && kvm_has_feat_enum(kvm, FEAT_MTE_ASYNC); +} + +#define check_pmu_revision(k, r) \ + ({ \ + (kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, r) && \ + !kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, IMP_DEF)); \ + }) + +static bool feat_pmuv3p1(struct kvm *kvm) +{ + return check_pmu_revision(kvm, V3P1); +} + +static bool feat_pmuv3p5(struct kvm *kvm) +{ + return check_pmu_revision(kvm, V3P5); +} + +static bool feat_pmuv3p7(struct kvm *kvm) +{ + return check_pmu_revision(kvm, V3P7); +} + +static bool feat_pmuv3p9(struct kvm *kvm) +{ + return check_pmu_revision(kvm, V3P9); +} + +static bool compute_hcr_rw(struct kvm *kvm, u64 *bits) +{ + /* This is purely academic: AArch32 and NV are mutually exclusive */ + if (bits) { + if (kvm_has_feat(kvm, FEAT_AA32EL1)) + *bits &= ~HCR_EL2_RW; + else + *bits |= HCR_EL2_RW; + } + + return true; +} + +static bool compute_hcr_e2h(struct kvm *kvm, u64 *bits) +{ + if (bits) { + if (kvm_has_feat(kvm, FEAT_E2H0)) + *bits &= ~HCR_EL2_E2H; + else + *bits |= HCR_EL2_E2H; + } + + return true; +} + +static const struct reg_bits_to_feat_map hfgrtr_feat_map[] = { + NEEDS_FEAT(HFGRTR_EL2_nAMAIR2_EL1 | + HFGRTR_EL2_nMAIR2_EL1, + FEAT_AIE), + NEEDS_FEAT(HFGRTR_EL2_nS2POR_EL1, FEAT_S2POE), + NEEDS_FEAT(HFGRTR_EL2_nPOR_EL1 | + HFGRTR_EL2_nPOR_EL0, + FEAT_S1POE), + NEEDS_FEAT(HFGRTR_EL2_nPIR_EL1 | + HFGRTR_EL2_nPIRE0_EL1, + FEAT_S1PIE), + NEEDS_FEAT(HFGRTR_EL2_nRCWMASK_EL1, FEAT_THE), + NEEDS_FEAT(HFGRTR_EL2_nTPIDR2_EL0 | + HFGRTR_EL2_nSMPRI_EL1, + FEAT_SME), + NEEDS_FEAT(HFGRTR_EL2_nGCS_EL1 | + HFGRTR_EL2_nGCS_EL0, + FEAT_GCS), + NEEDS_FEAT(HFGRTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA), + NEEDS_FEAT(HFGRTR_EL2_ERXADDR_EL1 | + HFGRTR_EL2_ERXMISCn_EL1 | + HFGRTR_EL2_ERXSTATUS_EL1 | + HFGRTR_EL2_ERXCTLR_EL1 | + HFGRTR_EL2_ERXFR_EL1 | + HFGRTR_EL2_ERRSELR_EL1 | + HFGRTR_EL2_ERRIDR_EL1, + FEAT_RAS), + NEEDS_FEAT(HFGRTR_EL2_ERXPFGCDN_EL1 | + HFGRTR_EL2_ERXPFGCTL_EL1 | + HFGRTR_EL2_ERXPFGF_EL1, + feat_rasv1p1), + NEEDS_FEAT(HFGRTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3), + NEEDS_FEAT(HFGRTR_EL2_SCXTNUM_EL0 | + HFGRTR_EL2_SCXTNUM_EL1, + feat_csv2_2_csv2_1p2), + NEEDS_FEAT(HFGRTR_EL2_LORSA_EL1 | + HFGRTR_EL2_LORN_EL1 | + HFGRTR_EL2_LORID_EL1 | + HFGRTR_EL2_LOREA_EL1 | + HFGRTR_EL2_LORC_EL1, + FEAT_LOR), + NEEDS_FEAT(HFGRTR_EL2_APIBKey | + HFGRTR_EL2_APIAKey | + HFGRTR_EL2_APGAKey | + HFGRTR_EL2_APDBKey | + HFGRTR_EL2_APDAKey, + feat_pauth), + NEEDS_FEAT_FLAG(HFGRTR_EL2_VBAR_EL1 | + HFGRTR_EL2_TTBR1_EL1 | + HFGRTR_EL2_TTBR0_EL1 | + HFGRTR_EL2_TPIDR_EL0 | + HFGRTR_EL2_TPIDRRO_EL0 | + HFGRTR_EL2_TPIDR_EL1 | + HFGRTR_EL2_TCR_EL1 | + HFGRTR_EL2_SCTLR_EL1 | + HFGRTR_EL2_REVIDR_EL1 | + HFGRTR_EL2_PAR_EL1 | + HFGRTR_EL2_MPIDR_EL1 | + HFGRTR_EL2_MIDR_EL1 
| + HFGRTR_EL2_MAIR_EL1 | + HFGRTR_EL2_ISR_EL1 | + HFGRTR_EL2_FAR_EL1 | + HFGRTR_EL2_ESR_EL1 | + HFGRTR_EL2_DCZID_EL0 | + HFGRTR_EL2_CTR_EL0 | + HFGRTR_EL2_CSSELR_EL1 | + HFGRTR_EL2_CPACR_EL1 | + HFGRTR_EL2_CONTEXTIDR_EL1| + HFGRTR_EL2_CLIDR_EL1 | + HFGRTR_EL2_CCSIDR_EL1 | + HFGRTR_EL2_AMAIR_EL1 | + HFGRTR_EL2_AIDR_EL1 | + HFGRTR_EL2_AFSR1_EL1 | + HFGRTR_EL2_AFSR0_EL1, + NEVER_FGU, FEAT_AA64EL1), +}; + + +static const DECLARE_FEAT_MAP_FGT(hfgrtr_desc, hfgrtr_masks, + hfgrtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hfgwtr_feat_map[] = { + NEEDS_FEAT(HFGWTR_EL2_nAMAIR2_EL1 | + HFGWTR_EL2_nMAIR2_EL1, + FEAT_AIE), + NEEDS_FEAT(HFGWTR_EL2_nS2POR_EL1, FEAT_S2POE), + NEEDS_FEAT(HFGWTR_EL2_nPOR_EL1 | + HFGWTR_EL2_nPOR_EL0, + FEAT_S1POE), + NEEDS_FEAT(HFGWTR_EL2_nPIR_EL1 | + HFGWTR_EL2_nPIRE0_EL1, + FEAT_S1PIE), + NEEDS_FEAT(HFGWTR_EL2_nRCWMASK_EL1, FEAT_THE), + NEEDS_FEAT(HFGWTR_EL2_nTPIDR2_EL0 | + HFGWTR_EL2_nSMPRI_EL1, + FEAT_SME), + NEEDS_FEAT(HFGWTR_EL2_nGCS_EL1 | + HFGWTR_EL2_nGCS_EL0, + FEAT_GCS), + NEEDS_FEAT(HFGWTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA), + NEEDS_FEAT(HFGWTR_EL2_ERXADDR_EL1 | + HFGWTR_EL2_ERXMISCn_EL1 | + HFGWTR_EL2_ERXSTATUS_EL1 | + HFGWTR_EL2_ERXCTLR_EL1 | + HFGWTR_EL2_ERRSELR_EL1, + FEAT_RAS), + NEEDS_FEAT(HFGWTR_EL2_ERXPFGCDN_EL1 | + HFGWTR_EL2_ERXPFGCTL_EL1, + feat_rasv1p1), + NEEDS_FEAT(HFGWTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3), + NEEDS_FEAT(HFGWTR_EL2_SCXTNUM_EL0 | + HFGWTR_EL2_SCXTNUM_EL1, + feat_csv2_2_csv2_1p2), + NEEDS_FEAT(HFGWTR_EL2_LORSA_EL1 | + HFGWTR_EL2_LORN_EL1 | + HFGWTR_EL2_LOREA_EL1 | + HFGWTR_EL2_LORC_EL1, + FEAT_LOR), + NEEDS_FEAT(HFGWTR_EL2_APIBKey | + HFGWTR_EL2_APIAKey | + HFGWTR_EL2_APGAKey | + HFGWTR_EL2_APDBKey | + HFGWTR_EL2_APDAKey, + feat_pauth), + NEEDS_FEAT_FLAG(HFGWTR_EL2_VBAR_EL1 | + HFGWTR_EL2_TTBR1_EL1 | + HFGWTR_EL2_TTBR0_EL1 | + HFGWTR_EL2_TPIDR_EL0 | + HFGWTR_EL2_TPIDRRO_EL0 | + HFGWTR_EL2_TPIDR_EL1 | + HFGWTR_EL2_TCR_EL1 | + HFGWTR_EL2_SCTLR_EL1 | + HFGWTR_EL2_PAR_EL1 | + HFGWTR_EL2_MAIR_EL1 | + HFGWTR_EL2_FAR_EL1 | + HFGWTR_EL2_ESR_EL1 | + HFGWTR_EL2_CSSELR_EL1 | + HFGWTR_EL2_CPACR_EL1 | + HFGWTR_EL2_CONTEXTIDR_EL1| + HFGWTR_EL2_AMAIR_EL1 | + HFGWTR_EL2_AFSR1_EL1 | + HFGWTR_EL2_AFSR0_EL1, + NEVER_FGU, FEAT_AA64EL1), +}; + +static const DECLARE_FEAT_MAP_FGT(hfgwtr_desc, hfgwtr_masks, + hfgwtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hdfgrtr_feat_map[] = { + NEEDS_FEAT(HDFGRTR_EL2_PMBIDR_EL1 | + HDFGRTR_EL2_PMSLATFR_EL1 | + HDFGRTR_EL2_PMSIRR_EL1 | + HDFGRTR_EL2_PMSIDR_EL1 | + HDFGRTR_EL2_PMSICR_EL1 | + HDFGRTR_EL2_PMSFCR_EL1 | + HDFGRTR_EL2_PMSEVFR_EL1 | + HDFGRTR_EL2_PMSCR_EL1 | + HDFGRTR_EL2_PMBSR_EL1 | + HDFGRTR_EL2_PMBPTR_EL1 | + HDFGRTR_EL2_PMBLIMITR_EL1, + FEAT_SPE), + NEEDS_FEAT(HDFGRTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE), + NEEDS_FEAT(HDFGRTR_EL2_nBRBDATA | + HDFGRTR_EL2_nBRBCTL | + HDFGRTR_EL2_nBRBIDR, + FEAT_BRBE), + NEEDS_FEAT(HDFGRTR_EL2_TRCVICTLR | + HDFGRTR_EL2_TRCSTATR | + HDFGRTR_EL2_TRCSSCSRn | + HDFGRTR_EL2_TRCSEQSTR | + HDFGRTR_EL2_TRCPRGCTLR | + HDFGRTR_EL2_TRCOSLSR | + HDFGRTR_EL2_TRCIMSPECn | + HDFGRTR_EL2_TRCID | + HDFGRTR_EL2_TRCCNTVRn | + HDFGRTR_EL2_TRCCLAIM | + HDFGRTR_EL2_TRCAUXCTLR | + HDFGRTR_EL2_TRCAUTHSTATUS | + HDFGRTR_EL2_TRC, + FEAT_TRC_SR), + NEEDS_FEAT(HDFGRTR_EL2_PMCEIDn_EL0 | + HDFGRTR_EL2_PMUSERENR_EL0 | + HDFGRTR_EL2_PMMIR_EL1 | + HDFGRTR_EL2_PMSELR_EL0 | + HDFGRTR_EL2_PMOVS | + HDFGRTR_EL2_PMINTEN | + HDFGRTR_EL2_PMCNTEN | + HDFGRTR_EL2_PMCCNTR_EL0 | + HDFGRTR_EL2_PMCCFILTR_EL0 | + HDFGRTR_EL2_PMEVTYPERn_EL0 | + HDFGRTR_EL2_PMEVCNTRn_EL0, + 
FEAT_PMUv3), + NEEDS_FEAT(HDFGRTR_EL2_TRBTRG_EL1 | + HDFGRTR_EL2_TRBSR_EL1 | + HDFGRTR_EL2_TRBPTR_EL1 | + HDFGRTR_EL2_TRBMAR_EL1 | + HDFGRTR_EL2_TRBLIMITR_EL1 | + HDFGRTR_EL2_TRBIDR_EL1 | + HDFGRTR_EL2_TRBBASER_EL1, + FEAT_TRBE), + NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSDLR_EL1, NEVER_FGU, + FEAT_DoubleLock), + NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSECCR_EL1 | + HDFGRTR_EL2_OSLSR_EL1 | + HDFGRTR_EL2_DBGPRCR_EL1 | + HDFGRTR_EL2_DBGAUTHSTATUS_EL1| + HDFGRTR_EL2_DBGCLAIM | + HDFGRTR_EL2_MDSCR_EL1 | + HDFGRTR_EL2_DBGWVRn_EL1 | + HDFGRTR_EL2_DBGWCRn_EL1 | + HDFGRTR_EL2_DBGBVRn_EL1 | + HDFGRTR_EL2_DBGBCRn_EL1, + NEVER_FGU, FEAT_AA64EL1) +}; + +static const DECLARE_FEAT_MAP_FGT(hdfgrtr_desc, hdfgrtr_masks, + hdfgrtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hdfgwtr_feat_map[] = { + NEEDS_FEAT(HDFGWTR_EL2_PMSLATFR_EL1 | + HDFGWTR_EL2_PMSIRR_EL1 | + HDFGWTR_EL2_PMSICR_EL1 | + HDFGWTR_EL2_PMSFCR_EL1 | + HDFGWTR_EL2_PMSEVFR_EL1 | + HDFGWTR_EL2_PMSCR_EL1 | + HDFGWTR_EL2_PMBSR_EL1 | + HDFGWTR_EL2_PMBPTR_EL1 | + HDFGWTR_EL2_PMBLIMITR_EL1, + FEAT_SPE), + NEEDS_FEAT(HDFGWTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE), + NEEDS_FEAT(HDFGWTR_EL2_nBRBDATA | + HDFGWTR_EL2_nBRBCTL, + FEAT_BRBE), + NEEDS_FEAT(HDFGWTR_EL2_TRCVICTLR | + HDFGWTR_EL2_TRCSSCSRn | + HDFGWTR_EL2_TRCSEQSTR | + HDFGWTR_EL2_TRCPRGCTLR | + HDFGWTR_EL2_TRCOSLAR | + HDFGWTR_EL2_TRCIMSPECn | + HDFGWTR_EL2_TRCCNTVRn | + HDFGWTR_EL2_TRCCLAIM | + HDFGWTR_EL2_TRCAUXCTLR | + HDFGWTR_EL2_TRC, + FEAT_TRC_SR), + NEEDS_FEAT(HDFGWTR_EL2_PMUSERENR_EL0 | + HDFGWTR_EL2_PMCR_EL0 | + HDFGWTR_EL2_PMSWINC_EL0 | + HDFGWTR_EL2_PMSELR_EL0 | + HDFGWTR_EL2_PMOVS | + HDFGWTR_EL2_PMINTEN | + HDFGWTR_EL2_PMCNTEN | + HDFGWTR_EL2_PMCCNTR_EL0 | + HDFGWTR_EL2_PMCCFILTR_EL0 | + HDFGWTR_EL2_PMEVTYPERn_EL0 | + HDFGWTR_EL2_PMEVCNTRn_EL0, + FEAT_PMUv3), + NEEDS_FEAT(HDFGWTR_EL2_TRBTRG_EL1 | + HDFGWTR_EL2_TRBSR_EL1 | + HDFGWTR_EL2_TRBPTR_EL1 | + HDFGWTR_EL2_TRBMAR_EL1 | + HDFGWTR_EL2_TRBLIMITR_EL1 | + HDFGWTR_EL2_TRBBASER_EL1, + FEAT_TRBE), + NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSDLR_EL1, + NEVER_FGU, FEAT_DoubleLock), + NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSECCR_EL1 | + HDFGWTR_EL2_OSLAR_EL1 | + HDFGWTR_EL2_DBGPRCR_EL1 | + HDFGWTR_EL2_DBGCLAIM | + HDFGWTR_EL2_MDSCR_EL1 | + HDFGWTR_EL2_DBGWVRn_EL1 | + HDFGWTR_EL2_DBGWCRn_EL1 | + HDFGWTR_EL2_DBGBVRn_EL1 | + HDFGWTR_EL2_DBGBCRn_EL1, + NEVER_FGU, FEAT_AA64EL1), + NEEDS_FEAT(HDFGWTR_EL2_TRFCR_EL1, FEAT_TRF), +}; + +static const DECLARE_FEAT_MAP_FGT(hdfgwtr_desc, hdfgwtr_masks, + hdfgwtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hfgitr_feat_map[] = { + NEEDS_FEAT(HFGITR_EL2_PSBCSYNC, FEAT_SPEv1p5), + NEEDS_FEAT(HFGITR_EL2_ATS1E1A, FEAT_ATS1A), + NEEDS_FEAT(HFGITR_EL2_COSPRCTX, FEAT_SPECRES2), + NEEDS_FEAT(HFGITR_EL2_nGCSEPP | + HFGITR_EL2_nGCSSTR_EL1 | + HFGITR_EL2_nGCSPUSHM_EL1, + FEAT_GCS), + NEEDS_FEAT(HFGITR_EL2_nBRBIALL | + HFGITR_EL2_nBRBINJ, + FEAT_BRBE), + NEEDS_FEAT(HFGITR_EL2_CPPRCTX | + HFGITR_EL2_DVPRCTX | + HFGITR_EL2_CFPRCTX, + FEAT_SPECRES), + NEEDS_FEAT(HFGITR_EL2_TLBIRVAALE1 | + HFGITR_EL2_TLBIRVALE1 | + HFGITR_EL2_TLBIRVAAE1 | + HFGITR_EL2_TLBIRVAE1 | + HFGITR_EL2_TLBIRVAALE1IS | + HFGITR_EL2_TLBIRVALE1IS | + HFGITR_EL2_TLBIRVAAE1IS | + HFGITR_EL2_TLBIRVAE1IS | + HFGITR_EL2_TLBIRVAALE1OS | + HFGITR_EL2_TLBIRVALE1OS | + HFGITR_EL2_TLBIRVAAE1OS | + HFGITR_EL2_TLBIRVAE1OS, + FEAT_TLBIRANGE), + NEEDS_FEAT(HFGITR_EL2_TLBIVAALE1OS | + HFGITR_EL2_TLBIVALE1OS | + HFGITR_EL2_TLBIVAAE1OS | + HFGITR_EL2_TLBIASIDE1OS | + HFGITR_EL2_TLBIVAE1OS | + HFGITR_EL2_TLBIVMALLE1OS, + FEAT_TLBIOS), + NEEDS_FEAT(HFGITR_EL2_ATS1E1WP | + 
HFGITR_EL2_ATS1E1RP, + FEAT_PAN2), + NEEDS_FEAT(HFGITR_EL2_DCCVADP, FEAT_DPB2), + NEEDS_FEAT_FLAG(HFGITR_EL2_DCCVAC | + HFGITR_EL2_SVC_EL1 | + HFGITR_EL2_SVC_EL0 | + HFGITR_EL2_ERET | + HFGITR_EL2_TLBIVAALE1 | + HFGITR_EL2_TLBIVALE1 | + HFGITR_EL2_TLBIVAAE1 | + HFGITR_EL2_TLBIASIDE1 | + HFGITR_EL2_TLBIVAE1 | + HFGITR_EL2_TLBIVMALLE1 | + HFGITR_EL2_TLBIVAALE1IS | + HFGITR_EL2_TLBIVALE1IS | + HFGITR_EL2_TLBIVAAE1IS | + HFGITR_EL2_TLBIASIDE1IS | + HFGITR_EL2_TLBIVAE1IS | + HFGITR_EL2_TLBIVMALLE1IS| + HFGITR_EL2_ATS1E0W | + HFGITR_EL2_ATS1E0R | + HFGITR_EL2_ATS1E1W | + HFGITR_EL2_ATS1E1R | + HFGITR_EL2_DCZVA | + HFGITR_EL2_DCCIVAC | + HFGITR_EL2_DCCVAP | + HFGITR_EL2_DCCVAU | + HFGITR_EL2_DCCISW | + HFGITR_EL2_DCCSW | + HFGITR_EL2_DCISW | + HFGITR_EL2_DCIVAC | + HFGITR_EL2_ICIVAU | + HFGITR_EL2_ICIALLU | + HFGITR_EL2_ICIALLUIS, + NEVER_FGU, FEAT_AA64EL1), +}; + +static const DECLARE_FEAT_MAP_FGT(hfgitr_desc, hfgitr_masks, + hfgitr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hafgrtr_feat_map[] = { + NEEDS_FEAT(HAFGRTR_EL2_AMEVTYPER115_EL0 | + HAFGRTR_EL2_AMEVTYPER114_EL0 | + HAFGRTR_EL2_AMEVTYPER113_EL0 | + HAFGRTR_EL2_AMEVTYPER112_EL0 | + HAFGRTR_EL2_AMEVTYPER111_EL0 | + HAFGRTR_EL2_AMEVTYPER110_EL0 | + HAFGRTR_EL2_AMEVTYPER19_EL0 | + HAFGRTR_EL2_AMEVTYPER18_EL0 | + HAFGRTR_EL2_AMEVTYPER17_EL0 | + HAFGRTR_EL2_AMEVTYPER16_EL0 | + HAFGRTR_EL2_AMEVTYPER15_EL0 | + HAFGRTR_EL2_AMEVTYPER14_EL0 | + HAFGRTR_EL2_AMEVTYPER13_EL0 | + HAFGRTR_EL2_AMEVTYPER12_EL0 | + HAFGRTR_EL2_AMEVTYPER11_EL0 | + HAFGRTR_EL2_AMEVTYPER10_EL0 | + HAFGRTR_EL2_AMEVCNTR115_EL0 | + HAFGRTR_EL2_AMEVCNTR114_EL0 | + HAFGRTR_EL2_AMEVCNTR113_EL0 | + HAFGRTR_EL2_AMEVCNTR112_EL0 | + HAFGRTR_EL2_AMEVCNTR111_EL0 | + HAFGRTR_EL2_AMEVCNTR110_EL0 | + HAFGRTR_EL2_AMEVCNTR19_EL0 | + HAFGRTR_EL2_AMEVCNTR18_EL0 | + HAFGRTR_EL2_AMEVCNTR17_EL0 | + HAFGRTR_EL2_AMEVCNTR16_EL0 | + HAFGRTR_EL2_AMEVCNTR15_EL0 | + HAFGRTR_EL2_AMEVCNTR14_EL0 | + HAFGRTR_EL2_AMEVCNTR13_EL0 | + HAFGRTR_EL2_AMEVCNTR12_EL0 | + HAFGRTR_EL2_AMEVCNTR11_EL0 | + HAFGRTR_EL2_AMEVCNTR10_EL0 | + HAFGRTR_EL2_AMCNTEN1 | + HAFGRTR_EL2_AMCNTEN0 | + HAFGRTR_EL2_AMEVCNTR03_EL0 | + HAFGRTR_EL2_AMEVCNTR02_EL0 | + HAFGRTR_EL2_AMEVCNTR01_EL0 | + HAFGRTR_EL2_AMEVCNTR00_EL0, + FEAT_AMUv1), +}; + +static const DECLARE_FEAT_MAP_FGT(hafgrtr_desc, hafgrtr_masks, + hafgrtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hfgitr2_feat_map[] = { + NEEDS_FEAT(HFGITR2_EL2_nDCCIVAPS, FEAT_PoPS), + NEEDS_FEAT(HFGITR2_EL2_TSBCSYNC, FEAT_TRBEv1p1) +}; + +static const DECLARE_FEAT_MAP_FGT(hfgitr2_desc, hfgitr2_masks, + hfgitr2_feat_map, FEAT_FGT2); + +static const struct reg_bits_to_feat_map hfgrtr2_feat_map[] = { + NEEDS_FEAT(HFGRTR2_EL2_nPFAR_EL1, FEAT_PFAR), + NEEDS_FEAT(HFGRTR2_EL2_nERXGSR_EL1, FEAT_RASv2), + NEEDS_FEAT(HFGRTR2_EL2_nACTLRALIAS_EL1 | + HFGRTR2_EL2_nACTLRMASK_EL1 | + HFGRTR2_EL2_nCPACRALIAS_EL1 | + HFGRTR2_EL2_nCPACRMASK_EL1 | + HFGRTR2_EL2_nSCTLR2MASK_EL1 | + HFGRTR2_EL2_nSCTLRALIAS2_EL1 | + HFGRTR2_EL2_nSCTLRALIAS_EL1 | + HFGRTR2_EL2_nSCTLRMASK_EL1 | + HFGRTR2_EL2_nTCR2ALIAS_EL1 | + HFGRTR2_EL2_nTCR2MASK_EL1 | + HFGRTR2_EL2_nTCRALIAS_EL1 | + HFGRTR2_EL2_nTCRMASK_EL1, + FEAT_SRMASK), + NEEDS_FEAT(HFGRTR2_EL2_nRCWSMASK_EL1, FEAT_THE), +}; + +static const DECLARE_FEAT_MAP_FGT(hfgrtr2_desc, hfgrtr2_masks, + hfgrtr2_feat_map, FEAT_FGT2); + +static const struct reg_bits_to_feat_map hfgwtr2_feat_map[] = { + NEEDS_FEAT(HFGWTR2_EL2_nPFAR_EL1, FEAT_PFAR), + NEEDS_FEAT(HFGWTR2_EL2_nACTLRALIAS_EL1 | + HFGWTR2_EL2_nACTLRMASK_EL1 | + HFGWTR2_EL2_nCPACRALIAS_EL1 
| + HFGWTR2_EL2_nCPACRMASK_EL1 | + HFGWTR2_EL2_nSCTLR2MASK_EL1 | + HFGWTR2_EL2_nSCTLRALIAS2_EL1 | + HFGWTR2_EL2_nSCTLRALIAS_EL1 | + HFGWTR2_EL2_nSCTLRMASK_EL1 | + HFGWTR2_EL2_nTCR2ALIAS_EL1 | + HFGWTR2_EL2_nTCR2MASK_EL1 | + HFGWTR2_EL2_nTCRALIAS_EL1 | + HFGWTR2_EL2_nTCRMASK_EL1, + FEAT_SRMASK), + NEEDS_FEAT(HFGWTR2_EL2_nRCWSMASK_EL1, FEAT_THE), +}; + +static const DECLARE_FEAT_MAP_FGT(hfgwtr2_desc, hfgwtr2_masks, + hfgwtr2_feat_map, FEAT_FGT2); + +static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = { + NEEDS_FEAT(HDFGRTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9), + NEEDS_FEAT(HDFGRTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss), + NEEDS_FEAT(HDFGRTR2_EL2_nTRCITECR_EL1, FEAT_ITE), + NEEDS_FEAT(HDFGRTR2_EL2_nPMICFILTR_EL0 | + HDFGRTR2_EL2_nPMICNTR_EL0, + FEAT_PMUv3_ICNTR), + NEEDS_FEAT(HDFGRTR2_EL2_nPMUACR_EL1, feat_pmuv3p9), + NEEDS_FEAT(HDFGRTR2_EL2_nPMSSCR_EL1 | + HDFGRTR2_EL2_nPMSSDATA, + FEAT_PMUv3_SS), + NEEDS_FEAT(HDFGRTR2_EL2_nPMIAR_EL1, FEAT_SEBEP), + NEEDS_FEAT(HDFGRTR2_EL2_nPMSDSFR_EL1, feat_spe_fds), + NEEDS_FEAT(HDFGRTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM), + NEEDS_FEAT(HDFGRTR2_EL2_nSPMACCESSR_EL1 | + HDFGRTR2_EL2_nSPMCNTEN | + HDFGRTR2_EL2_nSPMCR_EL0 | + HDFGRTR2_EL2_nSPMDEVAFF_EL1 | + HDFGRTR2_EL2_nSPMEVCNTRn_EL0 | + HDFGRTR2_EL2_nSPMEVTYPERn_EL0| + HDFGRTR2_EL2_nSPMID | + HDFGRTR2_EL2_nSPMINTEN | + HDFGRTR2_EL2_nSPMOVS | + HDFGRTR2_EL2_nSPMSCR_EL1 | + HDFGRTR2_EL2_nSPMSELR_EL0, + FEAT_SPMU), + NEEDS_FEAT(HDFGRTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2), + NEEDS_FEAT(HDFGRTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam), +}; + +static const DECLARE_FEAT_MAP_FGT(hdfgrtr2_desc, hdfgrtr2_masks, + hdfgrtr2_feat_map, FEAT_FGT2); + +static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = { + NEEDS_FEAT(HDFGWTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9), + NEEDS_FEAT(HDFGWTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss), + NEEDS_FEAT(HDFGWTR2_EL2_nTRCITECR_EL1, FEAT_ITE), + NEEDS_FEAT(HDFGWTR2_EL2_nPMICFILTR_EL0 | + HDFGWTR2_EL2_nPMICNTR_EL0, + FEAT_PMUv3_ICNTR), + NEEDS_FEAT(HDFGWTR2_EL2_nPMUACR_EL1 | + HDFGWTR2_EL2_nPMZR_EL0, + feat_pmuv3p9), + NEEDS_FEAT(HDFGWTR2_EL2_nPMSSCR_EL1, FEAT_PMUv3_SS), + NEEDS_FEAT(HDFGWTR2_EL2_nPMIAR_EL1, FEAT_SEBEP), + NEEDS_FEAT(HDFGWTR2_EL2_nPMSDSFR_EL1, feat_spe_fds), + NEEDS_FEAT(HDFGWTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM), + NEEDS_FEAT(HDFGWTR2_EL2_nSPMACCESSR_EL1 | + HDFGWTR2_EL2_nSPMCNTEN | + HDFGWTR2_EL2_nSPMCR_EL0 | + HDFGWTR2_EL2_nSPMEVCNTRn_EL0 | + HDFGWTR2_EL2_nSPMEVTYPERn_EL0| + HDFGWTR2_EL2_nSPMINTEN | + HDFGWTR2_EL2_nSPMOVS | + HDFGWTR2_EL2_nSPMSCR_EL1 | + HDFGWTR2_EL2_nSPMSELR_EL0, + FEAT_SPMU), + NEEDS_FEAT(HDFGWTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2), + NEEDS_FEAT(HDFGWTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam), +}; + +static const DECLARE_FEAT_MAP_FGT(hdfgwtr2_desc, hdfgwtr2_masks, + hdfgwtr2_feat_map, FEAT_FGT2); + + +static const struct reg_bits_to_feat_map hcrx_feat_map[] = { + NEEDS_FEAT(HCRX_EL2_PACMEn, feat_pauth_lr), + NEEDS_FEAT(HCRX_EL2_EnFPM, FEAT_FPMR), + NEEDS_FEAT(HCRX_EL2_GCSEn, FEAT_GCS), + NEEDS_FEAT(HCRX_EL2_EnIDCP128, FEAT_SYSREG128), + NEEDS_FEAT(HCRX_EL2_EnSDERR, feat_aderr), + NEEDS_FEAT(HCRX_EL2_TMEA, FEAT_DoubleFault2), + NEEDS_FEAT(HCRX_EL2_EnSNERR, feat_anerr), + NEEDS_FEAT(HCRX_EL2_D128En, FEAT_D128), + NEEDS_FEAT(HCRX_EL2_PTTWI, FEAT_THE), + NEEDS_FEAT(HCRX_EL2_SCTLR2En, FEAT_SCTLR2), + NEEDS_FEAT(HCRX_EL2_TCR2En, FEAT_TCR2), + NEEDS_FEAT(HCRX_EL2_MSCEn | + HCRX_EL2_MCE2, + FEAT_MOPS), + NEEDS_FEAT(HCRX_EL2_CMOW, FEAT_CMOW), + NEEDS_FEAT(HCRX_EL2_VFNMI | + HCRX_EL2_VINMI | + HCRX_EL2_TALLINT, + FEAT_NMI), + NEEDS_FEAT(HCRX_EL2_SMPME, feat_sme_smps), + 
NEEDS_FEAT(HCRX_EL2_FGTnXS | + HCRX_EL2_FnXS, + FEAT_XS), + NEEDS_FEAT(HCRX_EL2_EnASR, FEAT_LS64_V), + NEEDS_FEAT(HCRX_EL2_EnALS, FEAT_LS64), + NEEDS_FEAT(HCRX_EL2_EnAS0, FEAT_LS64_ACCDATA), +}; + + +static const DECLARE_FEAT_MAP(hcrx_desc, __HCRX_EL2, + hcrx_feat_map, FEAT_HCX); + +static const struct reg_bits_to_feat_map hcr_feat_map[] = { + NEEDS_FEAT(HCR_EL2_TID0, FEAT_AA32EL0), + NEEDS_FEAT_FIXED(HCR_EL2_RW, compute_hcr_rw), + NEEDS_FEAT(HCR_EL2_HCD, not_feat_aa64el3), + NEEDS_FEAT(HCR_EL2_AMO | + HCR_EL2_BSU | + HCR_EL2_CD | + HCR_EL2_DC | + HCR_EL2_FB | + HCR_EL2_FMO | + HCR_EL2_ID | + HCR_EL2_IMO | + HCR_EL2_MIOCNCE | + HCR_EL2_PTW | + HCR_EL2_SWIO | + HCR_EL2_TACR | + HCR_EL2_TDZ | + HCR_EL2_TGE | + HCR_EL2_TID1 | + HCR_EL2_TID2 | + HCR_EL2_TID3 | + HCR_EL2_TIDCP | + HCR_EL2_TPCP | + HCR_EL2_TPU | + HCR_EL2_TRVM | + HCR_EL2_TSC | + HCR_EL2_TSW | + HCR_EL2_TTLB | + HCR_EL2_TVM | + HCR_EL2_TWE | + HCR_EL2_TWI | + HCR_EL2_VF | + HCR_EL2_VI | + HCR_EL2_VM | + HCR_EL2_VSE, + FEAT_AA64EL1), + NEEDS_FEAT(HCR_EL2_AMVOFFEN, FEAT_AMUv1p1), + NEEDS_FEAT(HCR_EL2_EnSCXT, feat_csv2_2_csv2_1p2), + NEEDS_FEAT(HCR_EL2_TICAB | + HCR_EL2_TID4 | + HCR_EL2_TOCU, + FEAT_EVT), + NEEDS_FEAT(HCR_EL2_TTLBIS | + HCR_EL2_TTLBOS, + FEAT_EVT_TTLBxS), + NEEDS_FEAT(HCR_EL2_TLOR, FEAT_LOR), + NEEDS_FEAT(HCR_EL2_ATA | + HCR_EL2_DCT | + HCR_EL2_TID5, + FEAT_MTE2), + NEEDS_FEAT(HCR_EL2_AT | /* Ignore the original FEAT_NV */ + HCR_EL2_NV2 | + HCR_EL2_NV, + feat_nv2), + NEEDS_FEAT(HCR_EL2_NV1, feat_nv2_e2h0_ni), /* Missing from JSON */ + NEEDS_FEAT(HCR_EL2_API | + HCR_EL2_APK, + feat_pauth), + NEEDS_FEAT(HCR_EL2_TEA | + HCR_EL2_TERR, + FEAT_RAS), + NEEDS_FEAT(HCR_EL2_FIEN, feat_rasv1p1), + NEEDS_FEAT(HCR_EL2_GPF, FEAT_RME), + NEEDS_FEAT(HCR_EL2_FWB, FEAT_S2FWB), + NEEDS_FEAT(HCR_EL2_TME, FEAT_TME), + NEEDS_FEAT(HCR_EL2_TWEDEL | + HCR_EL2_TWEDEn, + FEAT_TWED), + NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h), +}; + +static const DECLARE_FEAT_MAP(hcr_desc, HCR_EL2, + hcr_feat_map, FEAT_AA64EL2); + +static const struct reg_bits_to_feat_map sctlr2_feat_map[] = { + NEEDS_FEAT(SCTLR2_EL1_NMEA | + SCTLR2_EL1_EASE, + FEAT_DoubleFault2), + NEEDS_FEAT(SCTLR2_EL1_EnADERR, feat_aderr), + NEEDS_FEAT(SCTLR2_EL1_EnANERR, feat_anerr), + NEEDS_FEAT(SCTLR2_EL1_EnIDCP128, FEAT_SYSREG128), + NEEDS_FEAT(SCTLR2_EL1_EnPACM | + SCTLR2_EL1_EnPACM0, + feat_pauth_lr), + NEEDS_FEAT(SCTLR2_EL1_CPTA | + SCTLR2_EL1_CPTA0 | + SCTLR2_EL1_CPTM | + SCTLR2_EL1_CPTM0, + FEAT_CPA2), +}; + +static const DECLARE_FEAT_MAP(sctlr2_desc, SCTLR2_EL1, + sctlr2_feat_map, FEAT_SCTLR2); + +static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = { + NEEDS_FEAT(TCR2_EL2_FNG1 | + TCR2_EL2_FNG0 | + TCR2_EL2_A2, + feat_asid2_e2h1), + NEEDS_FEAT(TCR2_EL2_DisCH1 | + TCR2_EL2_DisCH0 | + TCR2_EL2_D128, + feat_d128_e2h1), + NEEDS_FEAT(TCR2_EL2_AMEC1, feat_mec_e2h1), + NEEDS_FEAT(TCR2_EL2_AMEC0, FEAT_MEC), + NEEDS_FEAT(TCR2_EL2_HAFT, FEAT_HAFT), + NEEDS_FEAT(TCR2_EL2_PTTWI | + TCR2_EL2_PnCH, + FEAT_THE), + NEEDS_FEAT(TCR2_EL2_AIE, FEAT_AIE), + NEEDS_FEAT(TCR2_EL2_POE | + TCR2_EL2_E0POE, + FEAT_S1POE), + NEEDS_FEAT(TCR2_EL2_PIE, FEAT_S1PIE), +}; + +static const DECLARE_FEAT_MAP(tcr2_el2_desc, TCR2_EL2, + tcr2_el2_feat_map, FEAT_TCR2); + +static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { + NEEDS_FEAT(SCTLR_EL1_CP15BEN | + SCTLR_EL1_ITD | + SCTLR_EL1_SED, + FEAT_AA32EL0), + NEEDS_FEAT(SCTLR_EL1_BT0 | + SCTLR_EL1_BT1, + FEAT_BTI), + NEEDS_FEAT(SCTLR_EL1_CMOW, FEAT_CMOW), + NEEDS_FEAT(SCTLR_EL1_TSCXT, feat_csv2_2_csv2_1p2), + NEEDS_FEAT(SCTLR_EL1_EIS | + 
SCTLR_EL1_EOS, + FEAT_ExS), + NEEDS_FEAT(SCTLR_EL1_EnFPM, FEAT_FPMR), + NEEDS_FEAT(SCTLR_EL1_IESB, FEAT_IESB), + NEEDS_FEAT(SCTLR_EL1_EnALS, FEAT_LS64), + NEEDS_FEAT(SCTLR_EL1_EnAS0, FEAT_LS64_ACCDATA), + NEEDS_FEAT(SCTLR_EL1_EnASR, FEAT_LS64_V), + NEEDS_FEAT(SCTLR_EL1_nAA, FEAT_LSE2), + NEEDS_FEAT(SCTLR_EL1_LSMAOE | + SCTLR_EL1_nTLSMD, + FEAT_LSMAOC), + NEEDS_FEAT(SCTLR_EL1_EE, FEAT_MixedEnd), + NEEDS_FEAT(SCTLR_EL1_E0E, feat_mixedendel0), + NEEDS_FEAT(SCTLR_EL1_MSCEn, FEAT_MOPS), + NEEDS_FEAT(SCTLR_EL1_ATA0 | + SCTLR_EL1_ATA | + SCTLR_EL1_TCF0 | + SCTLR_EL1_TCF, + FEAT_MTE2), + NEEDS_FEAT(SCTLR_EL1_ITFSB, feat_mte_async), + NEEDS_FEAT(SCTLR_EL1_TCSO0 | + SCTLR_EL1_TCSO, + FEAT_MTE_STORE_ONLY), + NEEDS_FEAT(SCTLR_EL1_NMI | + SCTLR_EL1_SPINTMASK, + FEAT_NMI), + NEEDS_FEAT(SCTLR_EL1_SPAN, FEAT_PAN), + NEEDS_FEAT(SCTLR_EL1_EPAN, FEAT_PAN3), + NEEDS_FEAT(SCTLR_EL1_EnDA | + SCTLR_EL1_EnDB | + SCTLR_EL1_EnIA | + SCTLR_EL1_EnIB, + feat_pauth), + NEEDS_FEAT(SCTLR_EL1_EnTP2, FEAT_SME), + NEEDS_FEAT(SCTLR_EL1_EnRCTX, FEAT_SPECRES), + NEEDS_FEAT(SCTLR_EL1_DSSBS, FEAT_SSBS), + NEEDS_FEAT(SCTLR_EL1_TIDCP, FEAT_TIDCP1), + NEEDS_FEAT(SCTLR_EL1_TME0 | + SCTLR_EL1_TME | + SCTLR_EL1_TMT0 | + SCTLR_EL1_TMT, + FEAT_TME), + NEEDS_FEAT(SCTLR_EL1_TWEDEL | + SCTLR_EL1_TWEDEn, + FEAT_TWED), + NEEDS_FEAT(SCTLR_EL1_UCI | + SCTLR_EL1_EE | + SCTLR_EL1_E0E | + SCTLR_EL1_WXN | + SCTLR_EL1_nTWE | + SCTLR_EL1_nTWI | + SCTLR_EL1_UCT | + SCTLR_EL1_DZE | + SCTLR_EL1_I | + SCTLR_EL1_UMA | + SCTLR_EL1_SA0 | + SCTLR_EL1_SA | + SCTLR_EL1_C | + SCTLR_EL1_A | + SCTLR_EL1_M, + FEAT_AA64EL1), +}; + +static const DECLARE_FEAT_MAP(sctlr_el1_desc, SCTLR_EL1, + sctlr_el1_feat_map, FEAT_AA64EL1); + +static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = { + NEEDS_FEAT(MDCR_EL2_EBWE, FEAT_Debugv8p9), + NEEDS_FEAT(MDCR_EL2_TDOSA, FEAT_DoubleLock), + NEEDS_FEAT(MDCR_EL2_PMEE, FEAT_EBEP), + NEEDS_FEAT(MDCR_EL2_TDCC, FEAT_FGT), + NEEDS_FEAT(MDCR_EL2_MTPME, FEAT_MTPMU), + NEEDS_FEAT(MDCR_EL2_HPME | + MDCR_EL2_HPMN | + MDCR_EL2_TPMCR | + MDCR_EL2_TPM, + FEAT_PMUv3), + NEEDS_FEAT(MDCR_EL2_HPMD, feat_pmuv3p1), + NEEDS_FEAT(MDCR_EL2_HCCD | + MDCR_EL2_HLP, + feat_pmuv3p5), + NEEDS_FEAT(MDCR_EL2_HPMFZO, feat_pmuv3p7), + NEEDS_FEAT(MDCR_EL2_PMSSE, FEAT_PMUv3_SS), + NEEDS_FEAT(MDCR_EL2_E2PB | + MDCR_EL2_TPMS, + FEAT_SPE), + NEEDS_FEAT(MDCR_EL2_HPMFZS, FEAT_SPEv1p2), + NEEDS_FEAT(MDCR_EL2_EnSPM, FEAT_SPMU), + NEEDS_FEAT(MDCR_EL2_EnSTEPOP, FEAT_STEP2), + NEEDS_FEAT(MDCR_EL2_E2TB, FEAT_TRBE), + NEEDS_FEAT(MDCR_EL2_TTRF, FEAT_TRF), + NEEDS_FEAT(MDCR_EL2_TDA | + MDCR_EL2_TDE | + MDCR_EL2_TDRA, + FEAT_AA64EL1), +}; + +static const DECLARE_FEAT_MAP(mdcr_el2_desc, MDCR_EL2, + mdcr_el2_feat_map, FEAT_AA64EL2); + +static void __init check_feat_map(const struct reg_bits_to_feat_map *map, + int map_size, u64 res0, const char *str) +{ + u64 mask = 0; + + for (int i = 0; i < map_size; i++) + mask |= map[i].bits; + + if (mask != ~res0) + kvm_err("Undefined %s behaviour, bits %016llx\n", + str, mask ^ ~res0); +} + +static u64 reg_feat_map_bits(const struct reg_bits_to_feat_map *map) +{ + return map->flags & RES0_POINTER ? 
~(*map->res0p) : map->bits; +} + +static void __init check_reg_desc(const struct reg_feat_map_desc *r) +{ + check_feat_map(r->bit_feat_map, r->bit_feat_map_sz, + ~reg_feat_map_bits(&r->feat_map), r->name); +} + +void __init check_feature_map(void) +{ + check_reg_desc(&hfgrtr_desc); + check_reg_desc(&hfgwtr_desc); + check_reg_desc(&hfgitr_desc); + check_reg_desc(&hdfgrtr_desc); + check_reg_desc(&hdfgwtr_desc); + check_reg_desc(&hafgrtr_desc); + check_reg_desc(&hfgrtr2_desc); + check_reg_desc(&hfgwtr2_desc); + check_reg_desc(&hfgitr2_desc); + check_reg_desc(&hdfgrtr2_desc); + check_reg_desc(&hdfgwtr2_desc); + check_reg_desc(&hcrx_desc); + check_reg_desc(&hcr_desc); + check_reg_desc(&sctlr2_desc); + check_reg_desc(&tcr2_el2_desc); + check_reg_desc(&sctlr_el1_desc); + check_reg_desc(&mdcr_el2_desc); +} + +static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map) +{ + u64 regval = kvm->arch.id_regs[map->regidx]; + u64 regfld = (regval >> map->shift) & GENMASK(map->width - 1, 0); + + if (map->sign) { + s64 sfld = sign_extend64(regfld, map->width - 1); + s64 slim = sign_extend64(map->lo_lim, map->width - 1); + return sfld >= slim; + } else { + return regfld >= map->lo_lim; + } +} + +static u64 __compute_fixed_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + u64 *fixed_bits, + unsigned long require, + unsigned long exclude) +{ + u64 val = 0; + + for (int i = 0; i < map_size; i++) { + bool match; + + if ((map[i].flags & require) != require) + continue; + + if (map[i].flags & exclude) + continue; + + if (map[i].flags & CALL_FUNC) + match = (map[i].flags & FIXED_VALUE) ? + map[i].fval(kvm, fixed_bits) : + map[i].match(kvm); + else + match = idreg_feat_match(kvm, &map[i]); + + if (!match || (map[i].flags & FIXED_VALUE)) + val |= reg_feat_map_bits(&map[i]); + } + + return val; +} + +static u64 compute_res0_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + unsigned long require, + unsigned long exclude) +{ + return __compute_fixed_bits(kvm, map, map_size, NULL, + require, exclude | FIXED_VALUE); +} + +static u64 compute_reg_res0_bits(struct kvm *kvm, + const struct reg_feat_map_desc *r, + unsigned long require, unsigned long exclude) + +{ + u64 res0; + + res0 = compute_res0_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, + require, exclude); + + /* + * If computing FGUs, don't take RES0 or register existence + * into account -- we're not computing bits for the register + * itself. 
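 *
 * (Editor's note, an assumption based on the surrounding code: "FGU" is
 * KVM's fine-grained UNDEF mask -- the per-group set of FGT bits whose
 * backing feature is absent from the guest, collected by compute_fgu()
 * below so that such accesses can be made to UNDEF.)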
+ */ + if (!(exclude & NEVER_FGU)) { + res0 |= compute_res0_bits(kvm, &r->feat_map, 1, require, exclude); + res0 |= ~reg_feat_map_bits(&r->feat_map); + } + + return res0; +} + +static u64 compute_reg_fixed_bits(struct kvm *kvm, + const struct reg_feat_map_desc *r, + u64 *fixed_bits, unsigned long require, + unsigned long exclude) +{ + return __compute_fixed_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, + fixed_bits, require | FIXED_VALUE, exclude); +} + +void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) +{ + u64 val = 0; + + switch (fgt) { + case HFGRTR_GROUP: + val |= compute_reg_res0_bits(kvm, &hfgrtr_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgwtr_desc, + 0, NEVER_FGU); + break; + case HFGITR_GROUP: + val |= compute_reg_res0_bits(kvm, &hfgitr_desc, + 0, NEVER_FGU); + break; + case HDFGRTR_GROUP: + val |= compute_reg_res0_bits(kvm, &hdfgrtr_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hdfgwtr_desc, + 0, NEVER_FGU); + break; + case HAFGRTR_GROUP: + val |= compute_reg_res0_bits(kvm, &hafgrtr_desc, + 0, NEVER_FGU); + break; + case HFGRTR2_GROUP: + val |= compute_reg_res0_bits(kvm, &hfgrtr2_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgwtr2_desc, + 0, NEVER_FGU); + break; + case HFGITR2_GROUP: + val |= compute_reg_res0_bits(kvm, &hfgitr2_desc, + 0, NEVER_FGU); + break; + case HDFGRTR2_GROUP: + val |= compute_reg_res0_bits(kvm, &hdfgrtr2_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hdfgwtr2_desc, + 0, NEVER_FGU); + break; + default: + BUG(); + } + + kvm->arch.fgu[fgt] = val; +} + +void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1) +{ + u64 fixed = 0, mask; + + switch (reg) { + case HFGRTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgrtr_desc, 0, 0); + *res1 = HFGRTR_EL2_RES1; + break; + case HFGWTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgwtr_desc, 0, 0); + *res1 = HFGWTR_EL2_RES1; + break; + case HFGITR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgitr_desc, 0, 0); + *res1 = HFGITR_EL2_RES1; + break; + case HDFGRTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hdfgrtr_desc, 0, 0); + *res1 = HDFGRTR_EL2_RES1; + break; + case HDFGWTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hdfgwtr_desc, 0, 0); + *res1 = HDFGWTR_EL2_RES1; + break; + case HAFGRTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hafgrtr_desc, 0, 0); + *res1 = HAFGRTR_EL2_RES1; + break; + case HFGRTR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgrtr2_desc, 0, 0); + *res1 = HFGRTR2_EL2_RES1; + break; + case HFGWTR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgwtr2_desc, 0, 0); + *res1 = HFGWTR2_EL2_RES1; + break; + case HFGITR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgitr2_desc, 0, 0); + *res1 = HFGITR2_EL2_RES1; + break; + case HDFGRTR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hdfgrtr2_desc, 0, 0); + *res1 = HDFGRTR2_EL2_RES1; + break; + case HDFGWTR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hdfgwtr2_desc, 0, 0); + *res1 = HDFGWTR2_EL2_RES1; + break; + case HCRX_EL2: + *res0 = compute_reg_res0_bits(kvm, &hcrx_desc, 0, 0); + *res1 = __HCRX_EL2_RES1; + break; + case HCR_EL2: + mask = compute_reg_fixed_bits(kvm, &hcr_desc, &fixed, 0, 0); + *res0 = compute_reg_res0_bits(kvm, &hcr_desc, 0, 0); + *res0 |= (mask & ~fixed); + *res1 = HCR_EL2_RES1 | (mask & fixed); + break; + case SCTLR2_EL1: + case SCTLR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &sctlr2_desc, 0, 0); + *res1 = SCTLR2_EL1_RES1; + break; + case TCR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &tcr2_el2_desc, 0, 0); + *res1 = TCR2_EL2_RES1; + break; + case 
SCTLR_EL1: + *res0 = compute_reg_res0_bits(kvm, &sctlr_el1_desc, 0, 0); + *res1 = SCTLR_EL1_RES1; + break; + case MDCR_EL2: + *res0 = compute_reg_res0_bits(kvm, &mdcr_el2_desc, 0, 0); + *res1 = MDCR_EL2_RES1; + break; + default: + WARN_ON_ONCE(1); + *res0 = *res1 = 0; + break; + } +} + +static __always_inline struct fgt_masks *__fgt_reg_to_masks(enum vcpu_sysreg reg) +{ + switch (reg) { + case HFGRTR_EL2: + return &hfgrtr_masks; + case HFGWTR_EL2: + return &hfgwtr_masks; + case HFGITR_EL2: + return &hfgitr_masks; + case HDFGRTR_EL2: + return &hdfgrtr_masks; + case HDFGWTR_EL2: + return &hdfgwtr_masks; + case HAFGRTR_EL2: + return &hafgrtr_masks; + case HFGRTR2_EL2: + return &hfgrtr2_masks; + case HFGWTR2_EL2: + return &hfgwtr2_masks; + case HFGITR2_EL2: + return &hfgitr2_masks; + case HDFGRTR2_EL2: + return &hdfgrtr2_masks; + case HDFGWTR2_EL2: + return &hdfgwtr2_masks; + default: + BUILD_BUG_ON(1); + } +} + +static __always_inline void __compute_fgt(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg) +{ + u64 fgu = vcpu->kvm->arch.fgu[__fgt_reg_to_group_id(reg)]; + struct fgt_masks *m = __fgt_reg_to_masks(reg); + u64 clear = 0, set = 0, val = m->nmask; + + set |= fgu & m->mask; + clear |= fgu & m->nmask; + + if (is_nested_ctxt(vcpu)) { + u64 nested = __vcpu_sys_reg(vcpu, reg); + set |= nested & m->mask; + clear |= ~nested & m->nmask; + } + + val |= set; + val &= ~clear; + *vcpu_fgt(vcpu, reg) = val; +} + +static void __compute_hfgwtr(struct kvm_vcpu *vcpu) +{ + __compute_fgt(vcpu, HFGWTR_EL2); + + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + *vcpu_fgt(vcpu, HFGWTR_EL2) |= HFGWTR_EL2_TCR_EL1; +} + +static void __compute_hdfgwtr(struct kvm_vcpu *vcpu) +{ + __compute_fgt(vcpu, HDFGWTR_EL2); + + if (is_hyp_ctxt(vcpu)) + *vcpu_fgt(vcpu, HDFGWTR_EL2) |= HDFGWTR_EL2_MDSCR_EL1; +} + +void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu) +{ + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + return; + + __compute_fgt(vcpu, HFGRTR_EL2); + __compute_hfgwtr(vcpu); + __compute_fgt(vcpu, HFGITR_EL2); + __compute_fgt(vcpu, HDFGRTR_EL2); + __compute_hdfgwtr(vcpu); + __compute_fgt(vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __compute_fgt(vcpu, HFGRTR2_EL2); + __compute_fgt(vcpu, HFGWTR2_EL2); + __compute_fgt(vcpu, HFGITR2_EL2); + __compute_fgt(vcpu, HDFGRTR2_EL2); + __compute_fgt(vcpu, HDFGWTR2_EL2); +} diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c new file mode 100644 index 000000000000..3ad6b7c6e4ba --- /dev/null +++ b/arch/arm64/kvm/debug.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Debug and Guest Debug support + * + * Copyright (C) 2015 - Linaro Ltd + * Authors: Alex Bennée <alex.bennee@linaro.org> + * Oliver Upton <oliver.upton@linux.dev> + */ + +#include <linux/kvm_host.h> +#include <linux/hw_breakpoint.h> + +#include <asm/debug-monitors.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_emulate.h> + +static int cpu_has_spe(u64 dfr0) +{ + return cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && + !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P); +} + +/** + * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value + * + * @vcpu: the vcpu pointer + * + * This ensures we will trap access to: + * - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR) + * - Debug ROM Address (MDCR_EL2_TDRA) + * - OS related registers (MDCR_EL2_TDOSA) + * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB) + * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) + * - Self-hosted Trace 
(MDCR_EL2_TTRF/MDCR_EL2_E2TB) + */ +static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + + /* + * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK + * to disable guest access to the profiling and trace buffers + */ + vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, + *host_data_ptr(nr_event_counters)); + vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | + MDCR_EL2_TPMS | + MDCR_EL2_TTRF | + MDCR_EL2_TPMCR | + MDCR_EL2_TDRA | + MDCR_EL2_TDOSA); + + /* Is the VM being debugged by userspace? */ + if (vcpu->guest_debug) + /* Route all software debug exceptions to EL2 */ + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; + + /* + * Trap debug registers if the guest doesn't have ownership of them. + */ + if (!kvm_guest_owns_debug_regs(vcpu)) + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; + + if (vcpu_has_nv(vcpu)) + kvm_nested_setup_mdcr_el2(vcpu); + + /* Write MDCR_EL2 directly if we're already at EL2 */ + if (has_vhe()) + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); + + preempt_enable(); +} + +void kvm_init_host_debug_data(void) +{ + u64 dfr0 = read_sysreg(id_aa64dfr0_el1); + + if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0) + *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, + read_sysreg(pmcr_el0)); + + *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0); + *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0); + + if (cpu_has_spe(dfr0)) + host_data_set_flag(HAS_SPE); + + if (has_vhe()) + return; + + /* Check if we have BRBE implemented and available at the host */ + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT)) + host_data_set_flag(HAS_BRBE); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) { + /* Force disable trace in protected mode in case of no TRBE */ + if (is_protected_kvm_enabled()) + host_data_set_flag(EL1_TRACING_CONFIGURED); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && + !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) + host_data_set_flag(HAS_TRBE); + } +} + +void kvm_debug_init_vhe(void) +{ + /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */ + if (host_data_test_flag(HAS_SPE)) + write_sysreg_el1(0, SYS_PMSCR); +} + +/* + * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host + * has taken over MDSCR_EL1. + * + * - Userspace is single-stepping the guest, and MDSCR_EL1.SS is forced to 1. + * + * - Userspace is using the breakpoint/watchpoint registers to debug the + * guest, and MDSCR_EL1.MDE is forced to 1. + * + * - The guest has enabled the OS Lock, and KVM is forcing MDSCR_EL1.MDE to 0, + * masking all debug exceptions affected by the OS Lock. + */ +static void setup_external_mdscr(struct kvm_vcpu *vcpu) +{ + /* + * Use the guest's MDSCR_EL1 as a starting point, since there are + * several other features controlled by MDSCR_EL1 that are not relevant + * to the host. + * + * Clear the bits that KVM may use which also satisfies emulation of + * the OS Lock as MDSCR_EL1.MDE is cleared. 
+ */ + u64 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1) & ~(MDSCR_EL1_SS | + MDSCR_EL1_MDE | + MDSCR_EL1_KDE); + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + mdscr |= MDSCR_EL1_SS; + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + mdscr |= MDSCR_EL1_MDE | MDSCR_EL1_KDE; + + vcpu->arch.external_mdscr_el1 = mdscr; +} + +void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) +{ + u64 mdscr; + + /* Must be called before kvm_vcpu_load_vhe() */ + KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); + + if (has_vhe()) + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + + /* + * Determine which of the possible debug states we're in: + * + * - VCPU_DEBUG_HOST_OWNED: KVM has taken ownership of the guest's + * breakpoint/watchpoint registers, or needs to use MDSCR_EL1 to do + * software step or emulate the effects of the OS Lock being enabled. + * + * - VCPU_DEBUG_GUEST_OWNED: The guest has debug exceptions enabled, and + * the breakpoint/watchpoint registers need to be loaded eagerly. + * + * - VCPU_DEBUG_FREE: Neither of the above apply, no breakpoint/watchpoint + * context needs to be loaded on the CPU. + */ + if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { + vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED; + setup_external_mdscr(vcpu); + + /* + * Steal the guest's single-step state machine if userspace wants + * single-step the guest. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS) + vcpu_clear_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + else + vcpu_set_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + + if (!vcpu_get_flag(vcpu, HOST_SS_ACTIVE_PENDING)) + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + } + } else { + mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); + + if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE)) + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + else + vcpu->arch.debug_owner = VCPU_DEBUG_FREE; + } + + kvm_arm_setup_mdcr_el2(vcpu); +} + +void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) +{ + if (has_vhe()) + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + + if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + return; + + /* + * Save the host's software step state and restore the guest's before + * potentially returning to userspace. + */ + if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) + vcpu_set_flag(vcpu, HOST_SS_ACTIVE_PENDING); + else + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); + + if (vcpu_get_flag(vcpu, GUEST_SS_ACTIVE_PENDING)) + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; +} + +/* + * Updates ownership of the debug registers after a trapped guest access to a + * breakpoint/watchpoint register. Host ownership of the debug registers is of + * strictly higher priority, and it is the responsibility of the VMM to emulate + * guest debug exceptions in this configuration. 
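 *
 * Editor's illustration (not part of the original patch): with
 * KVM_GUESTDBG_USE_HW set, a guest write to, say, DBGBCR0_EL1 does not
 * hand over the registers; kvm_debug_set_guest_ownership() below returns
 * early and the VMM keeps emulating the guest's debug exceptions.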
+ */ +void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) +{ + if (kvm_host_owns_debug_regs(vcpu)) + return; + + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + kvm_arm_setup_mdcr_el2(vcpu); +} + +void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) +{ + if (val & OSLAR_EL1_OSLK) + __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, |=, OSLSR_EL1_OSLK); + else + __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, &=, ~OSLSR_EL1_OSLK); + + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); +} + +static bool skip_trbe_access(bool skip_condition) +{ + return (WARN_ON_ONCE(preemptible()) || skip_condition || + is_protected_kvm_enabled() || !is_kvm_arm_initialised()); +} + +void kvm_enable_trbe(void) +{ + if (!skip_trbe_access(has_vhe())) + host_data_set_flag(TRBE_ENABLED); +} +EXPORT_SYMBOL_GPL(kvm_enable_trbe); + +void kvm_disable_trbe(void) +{ + if (!skip_trbe_access(has_vhe())) + host_data_clear_flag(TRBE_ENABLED); +} +EXPORT_SYMBOL_GPL(kvm_disable_trbe); + +void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) +{ + if (skip_trbe_access(false)) + return; + + if (has_vhe()) { + write_sysreg_s(trfcr_while_in_guest, SYS_TRFCR_EL12); + return; + } + + *host_data_ptr(trfcr_while_in_guest) = trfcr_while_in_guest; + if (read_sysreg_s(SYS_TRFCR_EL1) != trfcr_while_in_guest) + host_data_set_flag(EL1_TRACING_CONFIGURED); + else + host_data_clear_flag(EL1_TRACING_CONFIGURED); +} +EXPORT_SYMBOL_GPL(kvm_tracing_set_el1_configuration); diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c new file mode 100644 index 000000000000..834f13fb1fb7 --- /dev/null +++ b/arch/arm64/kvm/emulate-nested.c @@ -0,0 +1,2854 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 - Linaro and Columbia University + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> + +#include "hyp/include/hyp/adjust_pc.h" + +#include "trace.h" + +enum trap_behaviour { + BEHAVE_HANDLE_LOCALLY = 0, + + BEHAVE_FORWARD_READ = BIT(0), + BEHAVE_FORWARD_WRITE = BIT(1), + BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + + /* Traps that take effect in Host EL0, this is rare! */ + BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2), +}; + +struct trap_bits { + const enum vcpu_sysreg index; + const enum trap_behaviour behaviour; + const u64 value; + const u64 mask; +}; + +/* Coarse Grained Trap definitions */ +enum cgt_group_id { + /* Indicates no coarse trap control */ + __RESERVED__, + + /* + * The first batch of IDs denote coarse trapping that are used + * on their own instead of being part of a combination of + * trap controls. 
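 *
 * Editor's note: CGT_HCR_TSW or CGT_MDCR_TPM below are examples of such
 * standalone controls; the combined ones start at
 * __MULTIPLE_CONTROL_BITS__ (e.g. CGT_HCR_TID2_TID4).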
+ */ + CGT_HCR_TID1, + CGT_HCR_TID2, + CGT_HCR_TID3, + CGT_HCR_IMO, + CGT_HCR_FMO, + CGT_HCR_TIDCP, + CGT_HCR_TACR, + CGT_HCR_TSW, + CGT_HCR_TPC, + CGT_HCR_TPU, + CGT_HCR_TTLB, + CGT_HCR_TVM, + CGT_HCR_TDZ, + CGT_HCR_TRVM, + CGT_HCR_TLOR, + CGT_HCR_TERR, + CGT_HCR_APK, + CGT_HCR_NV, + CGT_HCR_NV_nNV2, + CGT_HCR_NV1_nNV2, + CGT_HCR_AT, + CGT_HCR_nFIEN, + CGT_HCR_TID4, + CGT_HCR_TICAB, + CGT_HCR_TOCU, + CGT_HCR_ENSCXT, + CGT_HCR_TTLBIS, + CGT_HCR_TTLBOS, + + CGT_MDCR_TPMCR, + CGT_MDCR_TPM, + CGT_MDCR_TDE, + CGT_MDCR_TDA, + CGT_MDCR_TDOSA, + CGT_MDCR_TDRA, + CGT_MDCR_E2PB, + CGT_MDCR_TPMS, + CGT_MDCR_TTRF, + CGT_MDCR_E2TB, + CGT_MDCR_TDCC, + + CGT_CPTR_TAM, + CGT_CPTR_TCPAC, + + CGT_HCRX_EnFPM, + CGT_HCRX_TCR2En, + CGT_HCRX_SCTLR2En, + + CGT_CNTHCTL_EL1TVT, + CGT_CNTHCTL_EL1TVCT, + + CGT_ICH_HCR_TC, + CGT_ICH_HCR_TALL0, + CGT_ICH_HCR_TALL1, + CGT_ICH_HCR_TDIR, + + /* + * Anything after this point is a combination of coarse trap + * controls, which must all be evaluated to decide what to do. + */ + __MULTIPLE_CONTROL_BITS__, + CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__, + CGT_HCR_TID2_TID4, + CGT_HCR_TTLB_TTLBIS, + CGT_HCR_TTLB_TTLBOS, + CGT_HCR_TVM_TRVM, + CGT_HCR_TVM_TRVM_HCRX_TCR2En, + CGT_HCR_TVM_TRVM_HCRX_SCTLR2En, + CGT_HCR_TPU_TICAB, + CGT_HCR_TPU_TOCU, + CGT_HCR_NV1_nNV2_ENSCXT, + CGT_MDCR_TPM_TPMCR, + CGT_MDCR_TPM_HPMN, + CGT_MDCR_TDE_TDA, + CGT_MDCR_TDE_TDOSA, + CGT_MDCR_TDE_TDRA, + CGT_MDCR_TDCC_TDE_TDA, + + CGT_ICH_HCR_TC_TDIR, + + /* + * Anything after this point requires a callback evaluating a + * complex trap condition. Ugly stuff. + */ + __COMPLEX_CONDITIONS__, + CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__, + CGT_CNTHCTL_EL1PTEN, + CGT_CNTHCTL_EL1NVPCT, + CGT_CNTHCTL_EL1NVVCT, + + CGT_CPTR_TTA, + CGT_MDCR_HPMN, + + /* Must be last */ + __NR_CGT_GROUP_IDS__ +}; + +static const struct trap_bits coarse_trap_bits[] = { + [CGT_HCR_TID1] = { + .index = HCR_EL2, + .value = HCR_TID1, + .mask = HCR_TID1, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_HCR_TID2] = { + .index = HCR_EL2, + .value = HCR_TID2, + .mask = HCR_TID2, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TID3] = { + .index = HCR_EL2, + .value = HCR_TID3, + .mask = HCR_TID3, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_HCR_IMO] = { + .index = HCR_EL2, + .value = HCR_IMO, + .mask = HCR_IMO, + .behaviour = BEHAVE_FORWARD_WRITE, + }, + [CGT_HCR_FMO] = { + .index = HCR_EL2, + .value = HCR_FMO, + .mask = HCR_FMO, + .behaviour = BEHAVE_FORWARD_WRITE, + }, + [CGT_HCR_TIDCP] = { + .index = HCR_EL2, + .value = HCR_TIDCP, + .mask = HCR_TIDCP, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TACR] = { + .index = HCR_EL2, + .value = HCR_TACR, + .mask = HCR_TACR, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TSW] = { + .index = HCR_EL2, + .value = HCR_TSW, + .mask = HCR_TSW, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */ + .index = HCR_EL2, + .value = HCR_TPC, + .mask = HCR_TPC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TPU] = { + .index = HCR_EL2, + .value = HCR_TPU, + .mask = HCR_TPU, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TTLB] = { + .index = HCR_EL2, + .value = HCR_TTLB, + .mask = HCR_TTLB, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TVM] = { + .index = HCR_EL2, + .value = HCR_TVM, + .mask = HCR_TVM, + .behaviour = BEHAVE_FORWARD_WRITE, + }, + [CGT_HCR_TDZ] = { + .index = HCR_EL2, + .value = HCR_TDZ, + .mask = HCR_TDZ, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TRVM] = { + .index = HCR_EL2, + .value = 
HCR_TRVM, + .mask = HCR_TRVM, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_HCR_TLOR] = { + .index = HCR_EL2, + .value = HCR_TLOR, + .mask = HCR_TLOR, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TERR] = { + .index = HCR_EL2, + .value = HCR_TERR, + .mask = HCR_TERR, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_APK] = { + .index = HCR_EL2, + .value = 0, + .mask = HCR_APK, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_NV] = { + .index = HCR_EL2, + .value = HCR_NV, + .mask = HCR_NV, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_NV_nNV2] = { + .index = HCR_EL2, + .value = HCR_NV, + .mask = HCR_NV | HCR_NV2, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_NV1_nNV2] = { + .index = HCR_EL2, + .value = HCR_NV | HCR_NV1, + .mask = HCR_NV | HCR_NV1 | HCR_NV2, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_AT] = { + .index = HCR_EL2, + .value = HCR_AT, + .mask = HCR_AT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_nFIEN] = { + .index = HCR_EL2, + .value = 0, + .mask = HCR_FIEN, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TID4] = { + .index = HCR_EL2, + .value = HCR_TID4, + .mask = HCR_TID4, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TICAB] = { + .index = HCR_EL2, + .value = HCR_TICAB, + .mask = HCR_TICAB, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TOCU] = { + .index = HCR_EL2, + .value = HCR_TOCU, + .mask = HCR_TOCU, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_ENSCXT] = { + .index = HCR_EL2, + .value = 0, + .mask = HCR_ENSCXT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TTLBIS] = { + .index = HCR_EL2, + .value = HCR_TTLBIS, + .mask = HCR_TTLBIS, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TTLBOS] = { + .index = HCR_EL2, + .value = HCR_TTLBOS, + .mask = HCR_TTLBOS, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TPMCR] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TPMCR, + .mask = MDCR_EL2_TPMCR, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, + }, + [CGT_MDCR_TPM] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TPM, + .mask = MDCR_EL2_TPM, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, + }, + [CGT_MDCR_TDE] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDE, + .mask = MDCR_EL2_TDE, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TDA] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDA, + .mask = MDCR_EL2_TDA, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TDOSA] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDOSA, + .mask = MDCR_EL2_TDOSA, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TDRA] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDRA, + .mask = MDCR_EL2_TDRA, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_E2PB] = { + .index = MDCR_EL2, + .value = 0, + .mask = BIT(MDCR_EL2_E2PB_SHIFT), + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TPMS] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TPMS, + .mask = MDCR_EL2_TPMS, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TTRF] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TTRF, + .mask = MDCR_EL2_TTRF, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_E2TB] = { + .index = MDCR_EL2, + .value = 0, + .mask = BIT(MDCR_EL2_E2TB_SHIFT), + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TDCC] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDCC, + .mask = MDCR_EL2_TDCC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CPTR_TAM] = { + .index = CPTR_EL2, + .value = CPTR_EL2_TAM, + .mask = CPTR_EL2_TAM, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CPTR_TCPAC] = { + .index = CPTR_EL2, + .value = CPTR_EL2_TCPAC, + .mask = CPTR_EL2_TCPAC, + .behaviour = 
BEHAVE_FORWARD_RW, + }, + [CGT_HCRX_EnFPM] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_EnFPM, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCRX_TCR2En] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_TCR2En, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCRX_SCTLR2En] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_SCTLR2En, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVT, + .mask = CNTHCTL_EL1TVT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVCT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVCT, + .mask = CNTHCTL_EL1TVCT, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_ICH_HCR_TC] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TC, + .mask = ICH_HCR_EL2_TC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TALL0] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TALL0, + .mask = ICH_HCR_EL2_TALL0, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TALL1] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TALL1, + .mask = ICH_HCR_EL2_TALL1, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TDIR] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TDIR, + .mask = ICH_HCR_EL2_TDIR, + .behaviour = BEHAVE_FORWARD_RW, + }, +}; + +#define MCB(id, ...) \ + [id - __MULTIPLE_CONTROL_BITS__] = \ + (const enum cgt_group_id[]){ \ + __VA_ARGS__, __RESERVED__ \ + } + +static const enum cgt_group_id *coarse_control_combo[] = { + MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4), + MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS), + MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS), + MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM), + MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En, + CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En), + MCB(CGT_HCR_TVM_TRVM_HCRX_SCTLR2En, + CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_SCTLR2En), + MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB), + MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU), + MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT), + MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR), + MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN), + MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA), + MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), + MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), + MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA), + + MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC), + MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR), +}; + +typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); + +/* + * Warning, maximum confusion ahead. + * + * When E2H=0, CNTHCTL_EL2[1:0] are defined as EL1PCEN:EL1PCTEN + * When E2H=1, CNTHCTL_EL2[11:10] are defined as EL1PTEN:EL1PCTEN + * + * Note the single letter difference? Yet, the bits have the same + * function despite a different layout and a different name. + * + * We don't try to reconcile this mess. We just use the E2H=0 bits + * to generate something that is in the E2H=1 format, and live with + * it. You're welcome. 
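 *
 * Editor's illustration (not part of the original patch): with E2H=0,
 * EL1PCTEN is bit 0 and EL1PCEN is bit 1; shifting that pair left by 10
 * lands them on bits 10 and 11, i.e. exactly where the E2H=1 layout puts
 * EL1PCTEN and EL1PTEN. This is what the "<< 10" in
 * get_sanitized_cnthctl() below relies on.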
+ */ +static u64 get_sanitized_cnthctl(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + if (!vcpu_el2_e2h_is_set(vcpu)) + val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10; + + return val & ((CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN) << 10); +} + +static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu) +{ + if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) +{ + if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu) +{ + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV)); +} + +static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2); + + if (!vcpu_el2_e2h_is_set(vcpu)) + val = translate_cptr_el2_to_cpacr_el1(val); + + if (val & CPACR_EL1_TTA) + return BEHAVE_FORWARD_RW; + + return BEHAVE_HANDLE_LOCALLY; +} + +static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + unsigned int idx; + + + switch (sysreg) { + case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30): + case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30): + idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg); + break; + case SYS_PMXEVTYPER_EL0: + case SYS_PMXEVCNTR_EL0: + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); + break; + default: + /* Someone used this trap helper for something else... */ + KVM_BUG_ON(1, vcpu->kvm); + return BEHAVE_HANDLE_LOCALLY; + } + + if (kvm_pmu_counter_is_hyp(vcpu, idx)) + return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0; + + return BEHAVE_HANDLE_LOCALLY; +} + +#define CCC(id, fn) \ + [id - __COMPLEX_CONDITIONS__] = fn + +static const complex_condition_check ccc[] = { + CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), + CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), + CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct), + CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct), + CCC(CGT_CPTR_TTA, check_cptr_tta), + CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), +}; + +/* + * Bit assignment for the trap controls. 
We use a 64bit word with the + * following layout for each trapped sysreg: + * + * [9:0] enum cgt_group_id (10 bits) + * [13:10] enum fgt_group_id (4 bits) + * [19:14] bit number in the FGT register (6 bits) + * [20] trap polarity (1 bit) + * [25:21] FG filter (5 bits) + * [35:26] Main SysReg table index (10 bits) + * [62:36] Unused (27 bits) + * [63] RES0 - Must be zero, as lost on insertion in the xarray + */ +#define TC_CGT_BITS 10 +#define TC_FGT_BITS 4 +#define TC_FGF_BITS 5 +#define TC_SRI_BITS 10 + +union trap_config { + u64 val; + struct { + unsigned long cgt:TC_CGT_BITS; /* Coarse Grained Trap id */ + unsigned long fgt:TC_FGT_BITS; /* Fine Grained Trap id */ + unsigned long bit:6; /* Bit number */ + unsigned long pol:1; /* Polarity */ + unsigned long fgf:TC_FGF_BITS; /* Fine Grained Filter */ + unsigned long sri:TC_SRI_BITS; /* SysReg Index */ + unsigned long unused:27; /* Unused, should be zero */ + unsigned long mbz:1; /* Must Be Zero */ + }; +}; + +struct encoding_to_trap_config { + const u32 encoding; + const u32 end; + const union trap_config tc; + const unsigned int line; +}; + +/* + * WARNING: using ranges is a treacherous endeavour, as sysregs that + * are part of an architectural range are not necessarily contiguous + * in the [Op0,Op1,CRn,CRm,Ops] space. Tread carefully. + */ +#define SR_RANGE_TRAP(sr_start, sr_end, trap_id) \ + { \ + .encoding = sr_start, \ + .end = sr_end, \ + .tc = { \ + .cgt = trap_id, \ + }, \ + .line = __LINE__, \ + } + +#define SR_TRAP(sr, trap_id) SR_RANGE_TRAP(sr, sr, trap_id) + +/* + * Map encoding to trap bits for exception reported with EC=0x18. + * These must only be evaluated when running a nested hypervisor, but + * that the current context is not a hypervisor context. When the + * trapped access matches one of the trap controls, the exception is + * re-injected in the nested hypervisor. 
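+ *
+ * (EC=0x18 covers trapped MRS/MSR/SYS instruction execution in
+ * AArch64 state; the lookup is keyed by the sysreg encoding rebuilt
+ * from the ESR_EL2 ISS field, in the same way esr_sys64_to_sysreg()
+ * is used above.)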
+ */ +static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { + SR_TRAP(SYS_REVIDR_EL1, CGT_HCR_TID1), + SR_TRAP(SYS_AIDR_EL1, CGT_HCR_TID1), + SR_TRAP(SYS_SMIDR_EL1, CGT_HCR_TID1), + SR_TRAP(SYS_CTR_EL0, CGT_HCR_TID2), + SR_TRAP(SYS_CCSIDR_EL1, CGT_HCR_TID2_TID4), + SR_TRAP(SYS_CCSIDR2_EL1, CGT_HCR_TID2_TID4), + SR_TRAP(SYS_CLIDR_EL1, CGT_HCR_TID2_TID4), + SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), + SR_RANGE_TRAP(SYS_ID_PFR0_EL1, + sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), + SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0), + sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0), + sys_reg(3, 1, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 2, 11, 0, 0), + sys_reg(3, 2, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 3, 11, 0, 0), + sys_reg(3, 3, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 4, 11, 0, 0), + sys_reg(3, 4, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 5, 11, 0, 0), + sys_reg(3, 5, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 6, 11, 0, 0), + sys_reg(3, 6, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 7, 11, 0, 0), + sys_reg(3, 7, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 0, 15, 0, 0), + sys_reg(3, 0, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 1, 15, 0, 0), + sys_reg(3, 1, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 2, 15, 0, 0), + sys_reg(3, 2, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 3, 15, 0, 0), + sys_reg(3, 3, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 4, 15, 0, 0), + sys_reg(3, 4, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 5, 15, 0, 0), + sys_reg(3, 5, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 6, 15, 0, 0), + sys_reg(3, 6, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 7, 15, 0, 0), + sys_reg(3, 7, 15, 15, 7), CGT_HCR_TIDCP), + SR_TRAP(SYS_ACTLR_EL1, CGT_HCR_TACR), + SR_TRAP(SYS_DC_ISW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CISW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_IGSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_IGDSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CGSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CGDSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CIGSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CIGDSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CIVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CVAP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CVADP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_IVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CIGVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CIGDVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_IGVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_IGDVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGDVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGVAP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGDVAP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGVADP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGDVADP, CGT_HCR_TPC), + SR_TRAP(SYS_IC_IVAU, CGT_HCR_TPU_TOCU), + SR_TRAP(SYS_IC_IALLU, CGT_HCR_TPU_TOCU), + SR_TRAP(SYS_IC_IALLUIS, CGT_HCR_TPU_TICAB), + SR_TRAP(SYS_DC_CVAU, CGT_HCR_TPU_TOCU), + SR_TRAP(OP_TLBI_RVAE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAAE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVALE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAALE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VMALLE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_ASIDE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAAE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VALE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAALE1, 
CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAAE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVALE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAALE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VMALLE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_ASIDE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAAE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VALE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAALE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAAE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVALE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAALE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VMALLE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_ASIDE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAAE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VALE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAALE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAAE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVALE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAALE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VMALLE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_ASIDE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAAE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VALE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAALE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VMALLE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_ASIDE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAAE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VALE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAALE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAAE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVALE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAALE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VMALLE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_ASIDE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAAE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VALE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAALE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAAE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVALE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAALE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(SYS_SCTLR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_SCTLR2_EL1, CGT_HCR_TVM_TRVM_HCRX_SCTLR2En), + SR_TRAP(SYS_TTBR0_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_TTBR1_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_TCR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_ESR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_FAR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_AFSR0_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_AFSR1_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIRE0_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL0, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En), + SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ), + SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ), + SR_TRAP(SYS_DC_GZVA, CGT_HCR_TDZ), + SR_TRAP(SYS_LORSA_EL1, CGT_HCR_TLOR), + SR_TRAP(SYS_LOREA_EL1, CGT_HCR_TLOR), + SR_TRAP(SYS_LORN_EL1, CGT_HCR_TLOR), + SR_TRAP(SYS_LORC_EL1, CGT_HCR_TLOR), + 
SR_TRAP(SYS_LORID_EL1, CGT_HCR_TLOR), + SR_TRAP(SYS_ERRIDR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERRSELR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXADDR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXCTLR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXFR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXMISC0_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXMISC1_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXMISC2_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXMISC3_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXSTATUS_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_APIAKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APIAKEYHI_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APIBKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APIBKEYHI_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APDAKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APDAKEYHI_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APDBKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APDBKEYHI_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APGAKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APGAKEYHI_EL1, CGT_HCR_APK), + /* All _EL2 registers */ + SR_TRAP(SYS_BRBCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VPIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VMPIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SCTLR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ACTLR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SCTLR2_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_HCR_EL2, + SYS_HCRX_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SMPRIMAP_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SMCR_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_TTBR0_EL2, + SYS_TCR2_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VTTBR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VTCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VNCR_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_HDFGRTR_EL2, + SYS_HAFGRTR_EL2, CGT_HCR_NV), + /* Skip the SP_EL1 encoding... */ + SR_TRAP(SYS_SPSR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ELR_EL2, CGT_HCR_NV), + /* Skip SPSR_irq, SPSR_abt, SPSR_und, SPSR_fiq */ + SR_TRAP(SYS_AFSR0_EL2, CGT_HCR_NV), + SR_TRAP(SYS_AFSR1_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ESR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VSESR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_TFSR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_FAR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_HPFAR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_PMSCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MAIR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_AMAIR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MPAMHCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MPAMVPMV_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MPAM2_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_MPAMVPM0_EL2, + SYS_MPAMVPM7_EL2, CGT_HCR_NV), + /* + * Note that the spec. describes a group of MEC registers + * whose access should not trap, therefore skip the following: + * MECID_A0_EL2, MECID_A1_EL2, MECID_P0_EL2, + * MECID_P1_EL2, MECIDR_EL2, VMECID_A_EL2, + * VMECID_P_EL2. 
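+	 * (These are the FEAT_MEC Memory Encryption Context registers.)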
+ */ + SR_RANGE_TRAP(SYS_VBAR_EL2, + SYS_RMR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VDISR_EL2, CGT_HCR_NV), + /* ICH_AP0R<m>_EL2 */ + SR_RANGE_TRAP(SYS_ICH_AP0R0_EL2, + SYS_ICH_AP0R3_EL2, CGT_HCR_NV), + /* ICH_AP1R<m>_EL2 */ + SR_RANGE_TRAP(SYS_ICH_AP1R0_EL2, + SYS_ICH_AP1R3_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ICC_SRE_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_ICH_HCR_EL2, + SYS_ICH_EISR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ICH_ELRSR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ICH_VMCR_EL2, CGT_HCR_NV), + /* ICH_LR<m>_EL2 */ + SR_RANGE_TRAP(SYS_ICH_LR0_EL2, + SYS_ICH_LR15_EL2, CGT_HCR_NV), + SR_TRAP(SYS_CONTEXTIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_TPIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SCXTNUM_EL2, CGT_HCR_NV), + /* AMEVCNTVOFF0<n>_EL2, AMEVCNTVOFF1<n>_EL2 */ + SR_RANGE_TRAP(SYS_AMEVCNTVOFF0n_EL2(0), + SYS_AMEVCNTVOFF1n_EL2(15), CGT_HCR_NV), + /* CNT*_EL2 */ + SR_TRAP(SYS_CNTVOFF_EL2, CGT_HCR_NV), + SR_TRAP(SYS_CNTPOFF_EL2, CGT_HCR_NV), + SR_TRAP(SYS_CNTHCTL_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_CNTHP_TVAL_EL2, + SYS_CNTHP_CVAL_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2, + SYS_CNTHV_CVAL_EL2, CGT_HCR_NV), + /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/ + SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0), + sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV), + SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0), + sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV), + SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT), + SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT), + SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV), + SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV), + SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV), + SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV), + SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV), + SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV), + SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1ISNXS, CGT_HCR_NV), + 
SR_TRAP(OP_TLBI_VALE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1ISNXS,CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1OSNXS,CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_CPP_RCTX, CGT_HCR_NV), + SR_TRAP(OP_DVP_RCTX, CGT_HCR_NV), + SR_TRAP(OP_CFP_RCTX, CGT_HCR_NV), + SR_TRAP(SYS_SP_EL1, CGT_HCR_NV_nNV2), + SR_TRAP(SYS_VBAR_EL1, CGT_HCR_NV1_nNV2), + SR_TRAP(SYS_ELR_EL1, CGT_HCR_NV1_nNV2), + SR_TRAP(SYS_SPSR_EL1, CGT_HCR_NV1_nNV2), + SR_TRAP(SYS_SCXTNUM_EL1, CGT_HCR_NV1_nNV2_ENSCXT), + SR_TRAP(SYS_SCXTNUM_EL0, CGT_HCR_ENSCXT), + SR_TRAP(OP_AT_S1E1R, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1W, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E0R, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT), + SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN), + SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN), + SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN), + SR_TRAP(SYS_PMCR_EL0, CGT_MDCR_TPM_TPMCR), + SR_TRAP(SYS_PMCNTENSET_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMCNTENCLR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMOVSSET_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM), + SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM), + SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM), + SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN), + 
SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_OSDTRRX_EL1, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_OSDTRTX_EL1, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_DBGDTR_EL0, CGT_MDCR_TDCC_TDE_TDA), + /* + * Also covers DBGDTRRX_EL0, which has the same encoding as + * SYS_DBGDTRTX_EL0... 
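+	 * (Both decode to op0=2, op1=3, CRn=0, CRm=5, op2=0; whether the
+	 * access hits the RX or the TX side depends on it being a read
+	 * or a write.)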
+ */ + SR_TRAP(SYS_DBGDTRTX_EL0, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_MDSCR_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_OSECCR_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(0), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(1), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(2), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(3), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(4), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(5), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(6), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(7), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(8), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(9), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(10), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(11), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(12), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(13), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(14), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(15), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(0), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(1), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(2), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(3), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(4), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(5), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(6), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(7), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(8), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(9), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(10), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(11), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(12), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(13), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(14), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(15), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(0), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(1), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(2), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(3), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(4), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(5), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(6), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(7), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(8), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(9), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(10), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(11), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(12), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(13), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(14), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(15), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(0), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(1), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(2), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(3), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(4), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(5), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(6), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(7), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(8), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(9), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(10), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(11), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(12), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(13), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(14), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGCLAIMSET_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGCLAIMCLR_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGAUTHSTATUS_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_OSLAR_EL1, CGT_MDCR_TDE_TDOSA), + SR_TRAP(SYS_OSLSR_EL1, CGT_MDCR_TDE_TDOSA), + SR_TRAP(SYS_OSDLR_EL1, CGT_MDCR_TDE_TDOSA), + 
SR_TRAP(SYS_DBGPRCR_EL1, CGT_MDCR_TDE_TDOSA), + SR_TRAP(SYS_MDRAR_EL1, CGT_MDCR_TDE_TDRA), + SR_TRAP(SYS_PMBLIMITR_EL1, CGT_MDCR_E2PB), + SR_TRAP(SYS_PMBPTR_EL1, CGT_MDCR_E2PB), + SR_TRAP(SYS_PMBSR_EL1, CGT_MDCR_E2PB), + SR_TRAP(SYS_PMSCR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSEVFR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSFCR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSICR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSIDR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSIRR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSLATFR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSNEVFR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSDSFR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_TRFCR_EL1, CGT_MDCR_TTRF), + SR_TRAP(SYS_TRBBASER_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBLIMITR_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBMAR_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBPTR_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBSR_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBTRG_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_CPACR_EL1, CGT_CPTR_TCPAC), + SR_TRAP(SYS_AMUSERENR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCFGR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCGCR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENCLR0_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENCLR1_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENSET0_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENSET1_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(4), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(5), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(6), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(7), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(8), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(9), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(10), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(11), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(12), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(13), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(14), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(15), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(4), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(5), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(6), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(7), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(8), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(9), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(10), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(11), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(12), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM), + /* op0=2, op1=1, and CRn<0b1000 */ + SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0), + sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA), + SR_TRAP(SYS_CNTP_TVAL_EL0, CGT_CNTHCTL_EL1PTEN), + SR_TRAP(SYS_CNTP_CVAL_EL0, CGT_CNTHCTL_EL1PTEN), + SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), + SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_CNTPCTSS_EL0, 
CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT), + SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT), + SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), + /* + * IMPDEF choice: + * We treat ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as + * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for + * ICC_SRE_EL1 access, and always handle it locally. + */ + SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR), + SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC), +}; + +static DEFINE_XARRAY(sr_forward_xa); + +enum fg_filter_id { + __NO_FGF__, + HCRX_FGTnXS, + + /* Must be last */ + __NR_FG_FILTER_IDS__ +}; + +#define __FGT(g, b, p, f) \ + { \ + .fgt = g ## _GROUP, \ + .bit = g ## _EL2_ ## b ## _SHIFT, \ + .pol = p, \ + .fgf = f, \ + } + +#define FGT(g, b, p) __FGT(g, b, p, __NO_FGF__) + +/* + * See the warning next to SR_RANGE_TRAP(), and apply the same + * level of caution. 
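+ *
+ * As an illustration, SR_FGT(SYS_AMAIR2_EL1, HFGRTR, nAMAIR2_EL1, 0)
+ * below records that the access traps when HFGRTR_EL2.nAMAIR2_EL1 is
+ * clear (negative polarity), while a trailing '1' means the access
+ * traps when the named FGT bit is set.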
+ */ +#define SR_FGF_RANGE(sr, e, g, b, p, f) \ + { \ + .encoding = sr, \ + .end = e, \ + .tc = __FGT(g, b, p, f), \ + .line = __LINE__, \ + } + +#define SR_FGF(sr, g, b, p, f) SR_FGF_RANGE(sr, sr, g, b, p, f) +#define SR_FGT(sr, g, b, p) SR_FGF_RANGE(sr, sr, g, b, p, __NO_FGF__) +#define SR_FGT_RANGE(sr, end, g, b, p) \ + SR_FGF_RANGE(sr, end, g, b, p, __NO_FGF__) + +static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { + /* HFGRTR_EL2, HFGWTR_EL2 */ + SR_FGT(SYS_AMAIR2_EL1, HFGRTR, nAMAIR2_EL1, 0), + SR_FGT(SYS_MAIR2_EL1, HFGRTR, nMAIR2_EL1, 0), + SR_FGT(SYS_S2POR_EL1, HFGRTR, nS2POR_EL1, 0), + SR_FGT(SYS_POR_EL1, HFGRTR, nPOR_EL1, 0), + SR_FGT(SYS_POR_EL0, HFGRTR, nPOR_EL0, 0), + SR_FGT(SYS_PIR_EL1, HFGRTR, nPIR_EL1, 0), + SR_FGT(SYS_PIRE0_EL1, HFGRTR, nPIRE0_EL1, 0), + SR_FGT(SYS_RCWMASK_EL1, HFGRTR, nRCWMASK_EL1, 0), + SR_FGT(SYS_TPIDR2_EL0, HFGRTR, nTPIDR2_EL0, 0), + SR_FGT(SYS_SMPRI_EL1, HFGRTR, nSMPRI_EL1, 0), + SR_FGT(SYS_GCSCR_EL1, HFGRTR, nGCS_EL1, 0), + SR_FGT(SYS_GCSPR_EL1, HFGRTR, nGCS_EL1, 0), + SR_FGT(SYS_GCSCRE0_EL1, HFGRTR, nGCS_EL0, 0), + SR_FGT(SYS_GCSPR_EL0, HFGRTR, nGCS_EL0, 0), + SR_FGT(SYS_ACCDATA_EL1, HFGRTR, nACCDATA_EL1, 0), + SR_FGT(SYS_ERXADDR_EL1, HFGRTR, ERXADDR_EL1, 1), + SR_FGT(SYS_ERXPFGCDN_EL1, HFGRTR, ERXPFGCDN_EL1, 1), + SR_FGT(SYS_ERXPFGCTL_EL1, HFGRTR, ERXPFGCTL_EL1, 1), + SR_FGT(SYS_ERXPFGF_EL1, HFGRTR, ERXPFGF_EL1, 1), + SR_FGT(SYS_ERXMISC0_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXMISC1_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXMISC2_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXMISC3_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXSTATUS_EL1, HFGRTR, ERXSTATUS_EL1, 1), + SR_FGT(SYS_ERXCTLR_EL1, HFGRTR, ERXCTLR_EL1, 1), + SR_FGT(SYS_ERXFR_EL1, HFGRTR, ERXFR_EL1, 1), + SR_FGT(SYS_ERRSELR_EL1, HFGRTR, ERRSELR_EL1, 1), + SR_FGT(SYS_ERRIDR_EL1, HFGRTR, ERRIDR_EL1, 1), + SR_FGT(SYS_ICC_IGRPEN0_EL1, HFGRTR, ICC_IGRPENn_EL1, 1), + SR_FGT(SYS_ICC_IGRPEN1_EL1, HFGRTR, ICC_IGRPENn_EL1, 1), + SR_FGT(SYS_VBAR_EL1, HFGRTR, VBAR_EL1, 1), + SR_FGT(SYS_TTBR1_EL1, HFGRTR, TTBR1_EL1, 1), + SR_FGT(SYS_TTBR0_EL1, HFGRTR, TTBR0_EL1, 1), + SR_FGT(SYS_TPIDR_EL0, HFGRTR, TPIDR_EL0, 1), + SR_FGT(SYS_TPIDRRO_EL0, HFGRTR, TPIDRRO_EL0, 1), + SR_FGT(SYS_TPIDR_EL1, HFGRTR, TPIDR_EL1, 1), + SR_FGT(SYS_TCR_EL1, HFGRTR, TCR_EL1, 1), + SR_FGT(SYS_TCR2_EL1, HFGRTR, TCR_EL1, 1), + SR_FGT(SYS_SCXTNUM_EL0, HFGRTR, SCXTNUM_EL0, 1), + SR_FGT(SYS_SCXTNUM_EL1, HFGRTR, SCXTNUM_EL1, 1), + SR_FGT(SYS_SCTLR_EL1, HFGRTR, SCTLR_EL1, 1), + SR_FGT(SYS_SCTLR2_EL1, HFGRTR, SCTLR_EL1, 1), + SR_FGT(SYS_REVIDR_EL1, HFGRTR, REVIDR_EL1, 1), + SR_FGT(SYS_PAR_EL1, HFGRTR, PAR_EL1, 1), + SR_FGT(SYS_MPIDR_EL1, HFGRTR, MPIDR_EL1, 1), + SR_FGT(SYS_MIDR_EL1, HFGRTR, MIDR_EL1, 1), + SR_FGT(SYS_MAIR_EL1, HFGRTR, MAIR_EL1, 1), + SR_FGT(SYS_LORSA_EL1, HFGRTR, LORSA_EL1, 1), + SR_FGT(SYS_LORN_EL1, HFGRTR, LORN_EL1, 1), + SR_FGT(SYS_LORID_EL1, HFGRTR, LORID_EL1, 1), + SR_FGT(SYS_LOREA_EL1, HFGRTR, LOREA_EL1, 1), + SR_FGT(SYS_LORC_EL1, HFGRTR, LORC_EL1, 1), + SR_FGT(SYS_ISR_EL1, HFGRTR, ISR_EL1, 1), + SR_FGT(SYS_FAR_EL1, HFGRTR, FAR_EL1, 1), + SR_FGT(SYS_ESR_EL1, HFGRTR, ESR_EL1, 1), + SR_FGT(SYS_DCZID_EL0, HFGRTR, DCZID_EL0, 1), + SR_FGT(SYS_CTR_EL0, HFGRTR, CTR_EL0, 1), + SR_FGT(SYS_CSSELR_EL1, HFGRTR, CSSELR_EL1, 1), + SR_FGT(SYS_CPACR_EL1, HFGRTR, CPACR_EL1, 1), + SR_FGT(SYS_CONTEXTIDR_EL1, HFGRTR, CONTEXTIDR_EL1, 1), + SR_FGT(SYS_CLIDR_EL1, HFGRTR, CLIDR_EL1, 1), + SR_FGT(SYS_CCSIDR_EL1, HFGRTR, CCSIDR_EL1, 1), + SR_FGT(SYS_APIBKEYLO_EL1, HFGRTR, APIBKey, 1), + 
SR_FGT(SYS_APIBKEYHI_EL1, HFGRTR, APIBKey, 1), + SR_FGT(SYS_APIAKEYLO_EL1, HFGRTR, APIAKey, 1), + SR_FGT(SYS_APIAKEYHI_EL1, HFGRTR, APIAKey, 1), + SR_FGT(SYS_APGAKEYLO_EL1, HFGRTR, APGAKey, 1), + SR_FGT(SYS_APGAKEYHI_EL1, HFGRTR, APGAKey, 1), + SR_FGT(SYS_APDBKEYLO_EL1, HFGRTR, APDBKey, 1), + SR_FGT(SYS_APDBKEYHI_EL1, HFGRTR, APDBKey, 1), + SR_FGT(SYS_APDAKEYLO_EL1, HFGRTR, APDAKey, 1), + SR_FGT(SYS_APDAKEYHI_EL1, HFGRTR, APDAKey, 1), + SR_FGT(SYS_AMAIR_EL1, HFGRTR, AMAIR_EL1, 1), + SR_FGT(SYS_AIDR_EL1, HFGRTR, AIDR_EL1, 1), + SR_FGT(SYS_AFSR1_EL1, HFGRTR, AFSR1_EL1, 1), + SR_FGT(SYS_AFSR0_EL1, HFGRTR, AFSR0_EL1, 1), + + /* HFGRTR2_EL2, HFGWTR2_EL2 */ + SR_FGT(SYS_ACTLRALIAS_EL1, HFGRTR2, nACTLRALIAS_EL1, 0), + SR_FGT(SYS_ACTLRMASK_EL1, HFGRTR2, nACTLRMASK_EL1, 0), + SR_FGT(SYS_CPACRALIAS_EL1, HFGRTR2, nCPACRALIAS_EL1, 0), + SR_FGT(SYS_CPACRMASK_EL1, HFGRTR2, nCPACRMASK_EL1, 0), + SR_FGT(SYS_PFAR_EL1, HFGRTR2, nPFAR_EL1, 0), + SR_FGT(SYS_RCWSMASK_EL1, HFGRTR2, nRCWSMASK_EL1, 0), + SR_FGT(SYS_SCTLR2ALIAS_EL1, HFGRTR2, nSCTLRALIAS2_EL1, 0), + SR_FGT(SYS_SCTLR2MASK_EL1, HFGRTR2, nSCTLR2MASK_EL1, 0), + SR_FGT(SYS_SCTLRALIAS_EL1, HFGRTR2, nSCTLRALIAS_EL1, 0), + SR_FGT(SYS_SCTLRMASK_EL1, HFGRTR2, nSCTLRMASK_EL1, 0), + SR_FGT(SYS_TCR2ALIAS_EL1, HFGRTR2, nTCR2ALIAS_EL1, 0), + SR_FGT(SYS_TCR2MASK_EL1, HFGRTR2, nTCR2MASK_EL1, 0), + SR_FGT(SYS_TCRALIAS_EL1, HFGRTR2, nTCRALIAS_EL1, 0), + SR_FGT(SYS_TCRMASK_EL1, HFGRTR2, nTCRMASK_EL1, 0), + SR_FGT(SYS_ERXGSR_EL1, HFGRTR2, nERXGSR_EL1, 0), + + /* HFGITR_EL2 */ + SR_FGT(OP_AT_S1E1A, HFGITR, ATS1E1A, 1), + SR_FGT(OP_COSP_RCTX, HFGITR, COSPRCTX, 1), + SR_FGT(OP_GCSPUSHX, HFGITR, nGCSEPP, 0), + SR_FGT(OP_GCSPOPX, HFGITR, nGCSEPP, 0), + SR_FGT(OP_GCSPUSHM, HFGITR, nGCSPUSHM_EL1, 0), + SR_FGT(OP_BRB_IALL, HFGITR, nBRBIALL, 0), + SR_FGT(OP_BRB_INJ, HFGITR, nBRBINJ, 0), + SR_FGT(SYS_DC_CVAC, HFGITR, DCCVAC, 1), + SR_FGT(SYS_DC_CGVAC, HFGITR, DCCVAC, 1), + SR_FGT(SYS_DC_CGDVAC, HFGITR, DCCVAC, 1), + SR_FGT(OP_CPP_RCTX, HFGITR, CPPRCTX, 1), + SR_FGT(OP_DVP_RCTX, HFGITR, DVPRCTX, 1), + SR_FGT(OP_CFP_RCTX, HFGITR, CFPRCTX, 1), + SR_FGT(OP_TLBI_VAALE1, HFGITR, TLBIVAALE1, 1), + SR_FGT(OP_TLBI_VALE1, HFGITR, TLBIVALE1, 1), + SR_FGT(OP_TLBI_VAAE1, HFGITR, TLBIVAAE1, 1), + SR_FGT(OP_TLBI_ASIDE1, HFGITR, TLBIASIDE1, 1), + SR_FGT(OP_TLBI_VAE1, HFGITR, TLBIVAE1, 1), + SR_FGT(OP_TLBI_VMALLE1, HFGITR, TLBIVMALLE1, 1), + SR_FGT(OP_TLBI_RVAALE1, HFGITR, TLBIRVAALE1, 1), + SR_FGT(OP_TLBI_RVALE1, HFGITR, TLBIRVALE1, 1), + SR_FGT(OP_TLBI_RVAAE1, HFGITR, TLBIRVAAE1, 1), + SR_FGT(OP_TLBI_RVAE1, HFGITR, TLBIRVAE1, 1), + SR_FGT(OP_TLBI_RVAALE1IS, HFGITR, TLBIRVAALE1IS, 1), + SR_FGT(OP_TLBI_RVALE1IS, HFGITR, TLBIRVALE1IS, 1), + SR_FGT(OP_TLBI_RVAAE1IS, HFGITR, TLBIRVAAE1IS, 1), + SR_FGT(OP_TLBI_RVAE1IS, HFGITR, TLBIRVAE1IS, 1), + SR_FGT(OP_TLBI_VAALE1IS, HFGITR, TLBIVAALE1IS, 1), + SR_FGT(OP_TLBI_VALE1IS, HFGITR, TLBIVALE1IS, 1), + SR_FGT(OP_TLBI_VAAE1IS, HFGITR, TLBIVAAE1IS, 1), + SR_FGT(OP_TLBI_ASIDE1IS, HFGITR, TLBIASIDE1IS, 1), + SR_FGT(OP_TLBI_VAE1IS, HFGITR, TLBIVAE1IS, 1), + SR_FGT(OP_TLBI_VMALLE1IS, HFGITR, TLBIVMALLE1IS, 1), + SR_FGT(OP_TLBI_RVAALE1OS, HFGITR, TLBIRVAALE1OS, 1), + SR_FGT(OP_TLBI_RVALE1OS, HFGITR, TLBIRVALE1OS, 1), + SR_FGT(OP_TLBI_RVAAE1OS, HFGITR, TLBIRVAAE1OS, 1), + SR_FGT(OP_TLBI_RVAE1OS, HFGITR, TLBIRVAE1OS, 1), + SR_FGT(OP_TLBI_VAALE1OS, HFGITR, TLBIVAALE1OS, 1), + SR_FGT(OP_TLBI_VALE1OS, HFGITR, TLBIVALE1OS, 1), + SR_FGT(OP_TLBI_VAAE1OS, HFGITR, TLBIVAAE1OS, 1), + SR_FGT(OP_TLBI_ASIDE1OS, HFGITR, TLBIASIDE1OS, 1), + SR_FGT(OP_TLBI_VAE1OS, HFGITR, 
TLBIVAE1OS, 1), + SR_FGT(OP_TLBI_VMALLE1OS, HFGITR, TLBIVMALLE1OS, 1), + /* nXS variants must be checked against HCRX_EL2.FGTnXS */ + SR_FGF(OP_TLBI_VAALE1NXS, HFGITR, TLBIVAALE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VALE1NXS, HFGITR, TLBIVALE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAAE1NXS, HFGITR, TLBIVAAE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_ASIDE1NXS, HFGITR, TLBIASIDE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAE1NXS, HFGITR, TLBIVAE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VMALLE1NXS, HFGITR, TLBIVMALLE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAALE1NXS, HFGITR, TLBIRVAALE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVALE1NXS, HFGITR, TLBIRVALE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAAE1NXS, HFGITR, TLBIRVAAE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAE1NXS, HFGITR, TLBIRVAE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAALE1ISNXS, HFGITR, TLBIRVAALE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVALE1ISNXS, HFGITR, TLBIRVALE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAAE1ISNXS, HFGITR, TLBIRVAAE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAE1ISNXS, HFGITR, TLBIRVAE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAALE1ISNXS, HFGITR, TLBIVAALE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VALE1ISNXS, HFGITR, TLBIVALE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAAE1ISNXS, HFGITR, TLBIVAAE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_ASIDE1ISNXS, HFGITR, TLBIASIDE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAE1ISNXS, HFGITR, TLBIVAE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VMALLE1ISNXS, HFGITR, TLBIVMALLE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAALE1OSNXS, HFGITR, TLBIRVAALE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVALE1OSNXS, HFGITR, TLBIRVALE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAAE1OSNXS, HFGITR, TLBIRVAAE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAE1OSNXS, HFGITR, TLBIRVAE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAALE1OSNXS, HFGITR, TLBIVAALE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VALE1OSNXS, HFGITR, TLBIVALE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAAE1OSNXS, HFGITR, TLBIVAAE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_ASIDE1OSNXS, HFGITR, TLBIASIDE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAE1OSNXS, HFGITR, TLBIVAE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VMALLE1OSNXS, HFGITR, TLBIVMALLE1OS, 1, HCRX_FGTnXS), + SR_FGT(OP_AT_S1E1WP, HFGITR, ATS1E1WP, 1), + SR_FGT(OP_AT_S1E1RP, HFGITR, ATS1E1RP, 1), + SR_FGT(OP_AT_S1E0W, HFGITR, ATS1E0W, 1), + SR_FGT(OP_AT_S1E0R, HFGITR, ATS1E0R, 1), + SR_FGT(OP_AT_S1E1W, HFGITR, ATS1E1W, 1), + SR_FGT(OP_AT_S1E1R, HFGITR, ATS1E1R, 1), + SR_FGT(SYS_DC_ZVA, HFGITR, DCZVA, 1), + SR_FGT(SYS_DC_GVA, HFGITR, DCZVA, 1), + SR_FGT(SYS_DC_GZVA, HFGITR, DCZVA, 1), + SR_FGT(SYS_DC_CIVAC, HFGITR, DCCIVAC, 1), + SR_FGT(SYS_DC_CIGVAC, HFGITR, DCCIVAC, 1), + SR_FGT(SYS_DC_CIGDVAC, HFGITR, DCCIVAC, 1), + SR_FGT(SYS_DC_CVADP, HFGITR, DCCVADP, 1), + SR_FGT(SYS_DC_CGVADP, HFGITR, DCCVADP, 1), + SR_FGT(SYS_DC_CGDVADP, HFGITR, DCCVADP, 1), + SR_FGT(SYS_DC_CVAP, HFGITR, DCCVAP, 1), + SR_FGT(SYS_DC_CGVAP, HFGITR, DCCVAP, 1), + SR_FGT(SYS_DC_CGDVAP, HFGITR, DCCVAP, 1), + SR_FGT(SYS_DC_CVAU, HFGITR, DCCVAU, 1), + SR_FGT(SYS_DC_CISW, HFGITR, DCCISW, 1), + SR_FGT(SYS_DC_CIGSW, HFGITR, DCCISW, 1), + SR_FGT(SYS_DC_CIGDSW, HFGITR, DCCISW, 1), + SR_FGT(SYS_DC_CSW, HFGITR, DCCSW, 1), + SR_FGT(SYS_DC_CGSW, HFGITR, DCCSW, 1), + SR_FGT(SYS_DC_CGDSW, HFGITR, DCCSW, 1), + SR_FGT(SYS_DC_ISW, HFGITR, DCISW, 1), + SR_FGT(SYS_DC_IGSW, HFGITR, DCISW, 1), + SR_FGT(SYS_DC_IGDSW, HFGITR, DCISW, 1), + SR_FGT(SYS_DC_IVAC, HFGITR, DCIVAC, 1), + SR_FGT(SYS_DC_IGVAC, HFGITR, DCIVAC, 1), + SR_FGT(SYS_DC_IGDVAC, HFGITR, DCIVAC, 1), + SR_FGT(SYS_IC_IVAU, HFGITR, ICIVAU, 1), + SR_FGT(SYS_IC_IALLU, HFGITR, ICIALLU, 
1), + SR_FGT(SYS_IC_IALLUIS, HFGITR, ICIALLUIS, 1), + + /* HFGITR2_EL2 */ + SR_FGT(SYS_DC_CIGDVAPS, HFGITR2, nDCCIVAPS, 0), + SR_FGT(SYS_DC_CIVAPS, HFGITR2, nDCCIVAPS, 0), + + /* HDFGRTR_EL2 */ + SR_FGT(SYS_PMBIDR_EL1, HDFGRTR, PMBIDR_EL1, 1), + SR_FGT(SYS_PMSNEVFR_EL1, HDFGRTR, nPMSNEVFR_EL1, 0), + SR_FGT(SYS_BRBINF_EL1(0), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(1), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(2), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(3), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(4), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(5), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(6), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(7), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(8), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(9), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(10), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(11), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(12), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(13), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(14), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(15), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(16), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(17), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(18), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(19), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(20), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(21), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(22), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(23), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(24), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(25), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(26), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(27), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(28), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(29), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(30), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(31), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINFINJ_EL1, HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(0), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(1), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(2), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(3), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(4), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(5), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(6), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(7), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(8), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(9), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(10), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(11), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(12), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(13), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(14), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(15), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(16), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(17), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(18), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(19), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(20), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(21), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(22), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(23), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(24), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(25), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(26), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(27), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(28), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(29), HDFGRTR, nBRBDATA, 0), 
+ SR_FGT(SYS_BRBSRC_EL1(30), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(31), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRCINJ_EL1, HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(0), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(1), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(2), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(3), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(4), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(5), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(6), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(7), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(8), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(9), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(10), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(11), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(12), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(13), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(14), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(15), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(16), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(17), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(18), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(19), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(20), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(21), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(22), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(23), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(24), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(25), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(26), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(27), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(28), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(29), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(30), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(31), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGTINJ_EL1, HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTS_EL1, HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBCR_EL1, HDFGRTR, nBRBCTL, 0), + SR_FGT(SYS_BRBFCR_EL1, HDFGRTR, nBRBCTL, 0), + SR_FGT(SYS_BRBIDR0_EL1, HDFGRTR, nBRBIDR, 0), + SR_FGT(SYS_PMCEID0_EL0, HDFGRTR, PMCEIDn_EL0, 1), + SR_FGT(SYS_PMCEID1_EL0, HDFGRTR, PMCEIDn_EL0, 1), + SR_FGT(SYS_PMUSERENR_EL0, HDFGRTR, PMUSERENR_EL0, 1), + SR_FGT(SYS_TRBTRG_EL1, HDFGRTR, TRBTRG_EL1, 1), + SR_FGT(SYS_TRBSR_EL1, HDFGRTR, TRBSR_EL1, 1), + SR_FGT(SYS_TRBPTR_EL1, HDFGRTR, TRBPTR_EL1, 1), + SR_FGT(SYS_TRBMAR_EL1, HDFGRTR, TRBMAR_EL1, 1), + SR_FGT(SYS_TRBLIMITR_EL1, HDFGRTR, TRBLIMITR_EL1, 1), + SR_FGT(SYS_TRBIDR_EL1, HDFGRTR, TRBIDR_EL1, 1), + SR_FGT(SYS_TRBBASER_EL1, HDFGRTR, TRBBASER_EL1, 1), + SR_FGT(SYS_TRCVICTLR, HDFGRTR, TRCVICTLR, 1), + SR_FGT(SYS_TRCSTATR, HDFGRTR, TRCSTATR, 1), + SR_FGT(SYS_TRCSSCSR(0), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(1), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(2), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(3), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(4), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(5), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(6), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(7), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSEQSTR, HDFGRTR, TRCSEQSTR, 1), + SR_FGT(SYS_TRCPRGCTLR, HDFGRTR, TRCPRGCTLR, 1), + SR_FGT(SYS_TRCOSLSR, HDFGRTR, TRCOSLSR, 1), + SR_FGT(SYS_TRCIMSPEC(0), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(1), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(2), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(3), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(4), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(5), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(6), HDFGRTR, 
TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(7), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCDEVARCH, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCDEVID, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR0, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR1, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR2, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR3, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR4, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR5, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR6, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR7, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR8, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR9, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR10, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR11, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR12, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR13, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCCNTVR(0), HDFGRTR, TRCCNTVRn, 1), + SR_FGT(SYS_TRCCNTVR(1), HDFGRTR, TRCCNTVRn, 1), + SR_FGT(SYS_TRCCNTVR(2), HDFGRTR, TRCCNTVRn, 1), + SR_FGT(SYS_TRCCNTVR(3), HDFGRTR, TRCCNTVRn, 1), + SR_FGT(SYS_TRCCLAIMCLR, HDFGRTR, TRCCLAIM, 1), + SR_FGT(SYS_TRCCLAIMSET, HDFGRTR, TRCCLAIM, 1), + SR_FGT(SYS_TRCAUXCTLR, HDFGRTR, TRCAUXCTLR, 1), + SR_FGT(SYS_TRCAUTHSTATUS, HDFGRTR, TRCAUTHSTATUS, 1), + SR_FGT(SYS_TRCACATR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(8), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(9), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(10), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(11), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(12), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(13), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(14), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(15), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(8), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(9), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(10), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(11), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(12), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(13), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(14), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(15), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCBBCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCCCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCCTLR0, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCCTLR1, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTCTLR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTCTLR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTCTLR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTCTLR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTRLDVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTRLDVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTRLDVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTRLDVR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCONFIGR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEVENTCTL0R, HDFGRTR, TRC, 1), + 
SR_FGT(SYS_TRCEVENTCTL1R, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEXTINSELR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEXTINSELR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEXTINSELR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEXTINSELR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCQCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(8), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(9), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(10), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(11), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(12), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(13), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(14), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(15), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(16), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(17), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(18), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(19), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(20), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(21), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(22), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(23), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(24), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(25), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(26), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(27), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(28), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(29), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(30), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(31), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSEQEVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSEQEVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSEQEVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSEQRSTEVR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSTALLCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSYNCPR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCTRACEIDR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCTSCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVIIECTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVIPCSSCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVISSCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCCTLR0, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCCTLR1, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_PMSLATFR_EL1, HDFGRTR, PMSLATFR_EL1, 1), + SR_FGT(SYS_PMSIRR_EL1, HDFGRTR, PMSIRR_EL1, 1), + SR_FGT(SYS_PMSIDR_EL1, HDFGRTR, PMSIDR_EL1, 1), + SR_FGT(SYS_PMSICR_EL1, HDFGRTR, PMSICR_EL1, 
1), + SR_FGT(SYS_PMSFCR_EL1, HDFGRTR, PMSFCR_EL1, 1), + SR_FGT(SYS_PMSEVFR_EL1, HDFGRTR, PMSEVFR_EL1, 1), + SR_FGT(SYS_PMSCR_EL1, HDFGRTR, PMSCR_EL1, 1), + SR_FGT(SYS_PMBSR_EL1, HDFGRTR, PMBSR_EL1, 1), + SR_FGT(SYS_PMBPTR_EL1, HDFGRTR, PMBPTR_EL1, 1), + SR_FGT(SYS_PMBLIMITR_EL1, HDFGRTR, PMBLIMITR_EL1, 1), + SR_FGT(SYS_PMMIR_EL1, HDFGRTR, PMMIR_EL1, 1), + SR_FGT(SYS_PMSELR_EL0, HDFGRTR, PMSELR_EL0, 1), + SR_FGT(SYS_PMOVSCLR_EL0, HDFGRTR, PMOVS, 1), + SR_FGT(SYS_PMOVSSET_EL0, HDFGRTR, PMOVS, 1), + SR_FGT(SYS_PMINTENCLR_EL1, HDFGRTR, PMINTEN, 1), + SR_FGT(SYS_PMINTENSET_EL1, HDFGRTR, PMINTEN, 1), + SR_FGT(SYS_PMCNTENCLR_EL0, HDFGRTR, PMCNTEN, 1), + SR_FGT(SYS_PMCNTENSET_EL0, HDFGRTR, PMCNTEN, 1), + SR_FGT(SYS_PMCCNTR_EL0, HDFGRTR, PMCCNTR_EL0, 1), + SR_FGT(SYS_PMCCFILTR_EL0, HDFGRTR, PMCCFILTR_EL0, 1), + SR_FGT_RANGE(SYS_PMEVTYPERn_EL0(0), + SYS_PMEVTYPERn_EL0(30), + HDFGRTR, PMEVTYPERn_EL0, 1), + SR_FGT_RANGE(SYS_PMEVCNTRn_EL0(0), + SYS_PMEVCNTRn_EL0(30), + HDFGRTR, PMEVCNTRn_EL0, 1), + SR_FGT(SYS_OSDLR_EL1, HDFGRTR, OSDLR_EL1, 1), + SR_FGT(SYS_OSECCR_EL1, HDFGRTR, OSECCR_EL1, 1), + SR_FGT(SYS_OSLSR_EL1, HDFGRTR, OSLSR_EL1, 1), + SR_FGT(SYS_DBGPRCR_EL1, HDFGRTR, DBGPRCR_EL1, 1), + SR_FGT(SYS_DBGAUTHSTATUS_EL1, HDFGRTR, DBGAUTHSTATUS_EL1, 1), + SR_FGT(SYS_DBGCLAIMSET_EL1, HDFGRTR, DBGCLAIM, 1), + SR_FGT(SYS_DBGCLAIMCLR_EL1, HDFGRTR, DBGCLAIM, 1), + SR_FGT(SYS_MDSCR_EL1, HDFGRTR, MDSCR_EL1, 1), + /* + * The trap bits capture *64* debug registers per bit, but the + * ARM ARM only describes the encoding for the first 16, and + * we don't really support more than that anyway. + */ + SR_FGT(SYS_DBGWVRn_EL1(0), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(1), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(2), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(3), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(4), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(5), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(6), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(7), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(8), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(9), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(10), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(11), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(12), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(13), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(14), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(15), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(0), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(1), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(2), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(3), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(4), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(5), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(6), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(7), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(8), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(9), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(10), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(11), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(12), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(13), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(14), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(15), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(0), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(1), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(2), HDFGRTR, DBGBVRn_EL1, 1), + 
SR_FGT(SYS_DBGBVRn_EL1(3), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(4), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(5), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(6), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(7), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(8), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(9), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(10), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(11), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(12), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(13), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(14), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(15), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(0), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(1), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(2), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(3), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(4), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(5), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(6), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(7), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(8), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(9), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(10), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(11), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(12), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(13), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(14), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(15), HDFGRTR, DBGBCRn_EL1, 1), + + /* HDFGRTR2_EL2 */ + SR_FGT(SYS_MDSELR_EL1, HDFGRTR2, nMDSELR_EL1, 0), + SR_FGT(SYS_MDSTEPOP_EL1, HDFGRTR2, nMDSTEPOP_EL1, 0), + SR_FGT(SYS_PMCCNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0), + SR_FGT_RANGE(SYS_PMEVCNTSVRn_EL1(0), + SYS_PMEVCNTSVRn_EL1(30), + HDFGRTR2, nPMSSDATA, 0), + SR_FGT(SYS_PMICNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0), + SR_FGT(SYS_PMECR_EL1, HDFGRTR2, nPMECR_EL1, 0), + SR_FGT(SYS_PMIAR_EL1, HDFGRTR2, nPMIAR_EL1, 0), + SR_FGT(SYS_PMICFILTR_EL0, HDFGRTR2, nPMICFILTR_EL0, 0), + SR_FGT(SYS_PMICNTR_EL0, HDFGRTR2, nPMICNTR_EL0, 0), + SR_FGT(SYS_PMSSCR_EL1, HDFGRTR2, nPMSSCR_EL1, 0), + SR_FGT(SYS_PMUACR_EL1, HDFGRTR2, nPMUACR_EL1, 0), + SR_FGT(SYS_SPMACCESSR_EL1, HDFGRTR2, nSPMACCESSR_EL1, 0), + SR_FGT(SYS_SPMCFGR_EL1, HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMDEVARCH_EL1, HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMCGCRn_EL1(0), HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMCGCRn_EL1(1), HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMIIDR_EL1, HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMCNTENCLR_EL0, HDFGRTR2, nSPMCNTEN, 0), + SR_FGT(SYS_SPMCNTENSET_EL0, HDFGRTR2, nSPMCNTEN, 0), + SR_FGT(SYS_SPMCR_EL0, HDFGRTR2, nSPMCR_EL0, 0), + SR_FGT(SYS_SPMDEVAFF_EL1, HDFGRTR2, nSPMDEVAFF_EL1, 0), + /* + * We have up to 64 of these registers in ranges of 16, banked via + * SPMSELR_EL0.BANK. We're only concerned with the accessors here, + * not the architectural registers. 
+ */ + SR_FGT_RANGE(SYS_SPMEVCNTRn_EL0(0), + SYS_SPMEVCNTRn_EL0(15), + HDFGRTR2, nSPMEVCNTRn_EL0, 0), + SR_FGT_RANGE(SYS_SPMEVFILT2Rn_EL0(0), + SYS_SPMEVFILT2Rn_EL0(15), + HDFGRTR2, nSPMEVTYPERn_EL0, 0), + SR_FGT_RANGE(SYS_SPMEVFILTRn_EL0(0), + SYS_SPMEVFILTRn_EL0(15), + HDFGRTR2, nSPMEVTYPERn_EL0, 0), + SR_FGT_RANGE(SYS_SPMEVTYPERn_EL0(0), + SYS_SPMEVTYPERn_EL0(15), + HDFGRTR2, nSPMEVTYPERn_EL0, 0), + SR_FGT(SYS_SPMINTENCLR_EL1, HDFGRTR2, nSPMINTEN, 0), + SR_FGT(SYS_SPMINTENSET_EL1, HDFGRTR2, nSPMINTEN, 0), + SR_FGT(SYS_SPMOVSCLR_EL0, HDFGRTR2, nSPMOVS, 0), + SR_FGT(SYS_SPMOVSSET_EL0, HDFGRTR2, nSPMOVS, 0), + SR_FGT(SYS_SPMSCR_EL1, HDFGRTR2, nSPMSCR_EL1, 0), + SR_FGT(SYS_SPMSELR_EL0, HDFGRTR2, nSPMSELR_EL0, 0), + SR_FGT(SYS_TRCITECR_EL1, HDFGRTR2, nTRCITECR_EL1, 0), + SR_FGT(SYS_PMBMAR_EL1, HDFGRTR2, nPMBMAR_EL1, 0), + SR_FGT(SYS_PMSDSFR_EL1, HDFGRTR2, nPMSDSFR_EL1, 0), + SR_FGT(SYS_TRBMPAM_EL1, HDFGRTR2, nTRBMPAM_EL1, 0), + + /* + * HDFGWTR_EL2 + * + * Although HDFGRTR_EL2 and HDFGWTR_EL2 registers largely + * overlap in their bit assignment, there are a number of bits + * that are RES0 on one side, and an actual trap bit on the + * other. The policy chosen here is to describe all the + * read-side mappings, and only the write-side mappings that + * differ from the read side, and the trap handler will pick + * the correct shadow register based on the access type. + * + * Same model applies to the FEAT_FGT2 registers. + */ + SR_FGT(SYS_TRFCR_EL1, HDFGWTR, TRFCR_EL1, 1), + SR_FGT(SYS_TRCOSLAR, HDFGWTR, TRCOSLAR, 1), + SR_FGT(SYS_PMCR_EL0, HDFGWTR, PMCR_EL0, 1), + SR_FGT(SYS_PMSWINC_EL0, HDFGWTR, PMSWINC_EL0, 1), + SR_FGT(SYS_OSLAR_EL1, HDFGWTR, OSLAR_EL1, 1), + + /* HDFGWTR2_EL2 */ + SR_FGT(SYS_PMZR_EL0, HDFGWTR2, nPMZR_EL0, 0), + SR_FGT(SYS_SPMZR_EL0, HDFGWTR2, nSPMEVCNTRn_EL0, 0), + + /* + * HAFGRTR_EL2 + */ + SR_FGT(SYS_AMEVTYPER1_EL0(15), HAFGRTR, AMEVTYPER115_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(14), HAFGRTR, AMEVTYPER114_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(13), HAFGRTR, AMEVTYPER113_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(12), HAFGRTR, AMEVTYPER112_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(11), HAFGRTR, AMEVTYPER111_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(10), HAFGRTR, AMEVTYPER110_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(9), HAFGRTR, AMEVTYPER19_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(8), HAFGRTR, AMEVTYPER18_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(7), HAFGRTR, AMEVTYPER17_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(6), HAFGRTR, AMEVTYPER16_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(5), HAFGRTR, AMEVTYPER15_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(4), HAFGRTR, AMEVTYPER14_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(3), HAFGRTR, AMEVTYPER13_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(2), HAFGRTR, AMEVTYPER12_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(1), HAFGRTR, AMEVTYPER11_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(0), HAFGRTR, AMEVTYPER10_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(15), HAFGRTR, AMEVCNTR115_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(14), HAFGRTR, AMEVCNTR114_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(13), HAFGRTR, AMEVCNTR113_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(12), HAFGRTR, AMEVCNTR112_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(11), HAFGRTR, AMEVCNTR111_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(10), HAFGRTR, AMEVCNTR110_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(9), HAFGRTR, AMEVCNTR19_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(8), HAFGRTR, AMEVCNTR18_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(7), HAFGRTR, AMEVCNTR17_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(6), HAFGRTR, AMEVCNTR16_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(5), HAFGRTR, AMEVCNTR15_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(4), 
HAFGRTR, AMEVCNTR14_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(3), HAFGRTR, AMEVCNTR13_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(2), HAFGRTR, AMEVCNTR12_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(1), HAFGRTR, AMEVCNTR11_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(0), HAFGRTR, AMEVCNTR10_EL0, 1), + SR_FGT(SYS_AMCNTENCLR1_EL0, HAFGRTR, AMCNTEN1, 1), + SR_FGT(SYS_AMCNTENSET1_EL0, HAFGRTR, AMCNTEN1, 1), + SR_FGT(SYS_AMCNTENCLR0_EL0, HAFGRTR, AMCNTEN0, 1), + SR_FGT(SYS_AMCNTENSET0_EL0, HAFGRTR, AMCNTEN0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(3), HAFGRTR, AMEVCNTR03_EL0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(2), HAFGRTR, AMEVCNTR02_EL0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(1), HAFGRTR, AMEVCNTR01_EL0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, AMEVCNTR00_EL0, 1), +}; + +/* + * Additional FGTs that do not fire with ESR_EL2.EC==0x18. This table + * isn't used for exception routing, but only as a promise that the + * trap is handled somewhere else. + */ +static const union trap_config non_0x18_fgt[] __initconst = { + FGT(HFGITR, PSBCSYNC, 1), + FGT(HFGITR, nGCSSTR_EL1, 0), + FGT(HFGITR, SVC_EL1, 1), + FGT(HFGITR, SVC_EL0, 1), + FGT(HFGITR, ERET, 1), + FGT(HFGITR2, TSBCSYNC, 1), +}; + +static union trap_config get_trap_config(u32 sysreg) +{ + return (union trap_config) { + .val = xa_to_value(xa_load(&sr_forward_xa, sysreg)), + }; +} + +static __init void print_nv_trap_error(const struct encoding_to_trap_config *tc, + const char *type, int err) +{ + kvm_err("%s line %d encoding range " + "(%d, %d, %d, %d, %d) - (%d, %d, %d, %d, %d) (err=%d)\n", + type, tc->line, + sys_reg_Op0(tc->encoding), sys_reg_Op1(tc->encoding), + sys_reg_CRn(tc->encoding), sys_reg_CRm(tc->encoding), + sys_reg_Op2(tc->encoding), + sys_reg_Op0(tc->end), sys_reg_Op1(tc->end), + sys_reg_CRn(tc->end), sys_reg_CRm(tc->end), + sys_reg_Op2(tc->end), + err); +} + +static u32 encoding_next(u32 encoding) +{ + u8 op0, op1, crn, crm, op2; + + op0 = sys_reg_Op0(encoding); + op1 = sys_reg_Op1(encoding); + crn = sys_reg_CRn(encoding); + crm = sys_reg_CRm(encoding); + op2 = sys_reg_Op2(encoding); + + if (op2 < Op2_mask) + return sys_reg(op0, op1, crn, crm, op2 + 1); + if (crm < CRm_mask) + return sys_reg(op0, op1, crn, crm + 1, 0); + if (crn < CRn_mask) + return sys_reg(op0, op1, crn + 1, 0, 0); + if (op1 < Op1_mask) + return sys_reg(op0, op1 + 1, 0, 0, 0); + + return sys_reg(op0 + 1, 0, 0, 0, 0); +} + +#define FGT_MASKS(__n, __m) \ + struct fgt_masks __n = { .str = #__m, .res0 = __m, } + +FGT_MASKS(hfgrtr_masks, HFGRTR_EL2_RES0); +FGT_MASKS(hfgwtr_masks, HFGWTR_EL2_RES0); +FGT_MASKS(hfgitr_masks, HFGITR_EL2_RES0); +FGT_MASKS(hdfgrtr_masks, HDFGRTR_EL2_RES0); +FGT_MASKS(hdfgwtr_masks, HDFGWTR_EL2_RES0); +FGT_MASKS(hafgrtr_masks, HAFGRTR_EL2_RES0); +FGT_MASKS(hfgrtr2_masks, HFGRTR2_EL2_RES0); +FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2_RES0); +FGT_MASKS(hfgitr2_masks, HFGITR2_EL2_RES0); +FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2_RES0); +FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2_RES0); + +static __init bool aggregate_fgt(union trap_config tc) +{ + struct fgt_masks *rmasks, *wmasks; + + switch (tc.fgt) { + case HFGRTR_GROUP: + rmasks = &hfgrtr_masks; + wmasks = &hfgwtr_masks; + break; + case HDFGRTR_GROUP: + rmasks = &hdfgrtr_masks; + wmasks = &hdfgwtr_masks; + break; + case HAFGRTR_GROUP: + rmasks = &hafgrtr_masks; + wmasks = NULL; + break; + case HFGITR_GROUP: + rmasks = &hfgitr_masks; + wmasks = NULL; + break; + case HFGRTR2_GROUP: + rmasks = &hfgrtr2_masks; + wmasks = &hfgwtr2_masks; + break; + case HDFGRTR2_GROUP: + rmasks = &hdfgrtr2_masks; + wmasks = &hdfgwtr2_masks; + break; + case HFGITR2_GROUP: 
+ rmasks = &hfgitr2_masks; + wmasks = NULL; + break; + } + + /* + * A bit can be reserved in either the R or W register, but + * not both. + */ + if ((BIT(tc.bit) & rmasks->res0) && + (!wmasks || (BIT(tc.bit) & wmasks->res0))) + return false; + + if (tc.pol) + rmasks->mask |= BIT(tc.bit) & ~rmasks->res0; + else + rmasks->nmask |= BIT(tc.bit) & ~rmasks->res0; + + if (wmasks) { + if (tc.pol) + wmasks->mask |= BIT(tc.bit) & ~wmasks->res0; + else + wmasks->nmask |= BIT(tc.bit) & ~wmasks->res0; + } + + return true; +} + +static __init int check_fgt_masks(struct fgt_masks *masks) +{ + unsigned long duplicate = masks->mask & masks->nmask; + u64 res0 = masks->res0; + int ret = 0; + + if (duplicate) { + int i; + + for_each_set_bit(i, &duplicate, 64) { + kvm_err("%s[%d] bit has both polarities\n", + masks->str, i); + } + + ret = -EINVAL; + } + + masks->res0 = ~(masks->mask | masks->nmask); + if (masks->res0 != res0) + kvm_info("Implicit %s = %016llx, expecting %016llx\n", + masks->str, masks->res0, res0); + + return ret; +} + +static __init int check_all_fgt_masks(int ret) +{ + static struct fgt_masks * const masks[] __initconst = { + &hfgrtr_masks, + &hfgwtr_masks, + &hfgitr_masks, + &hdfgrtr_masks, + &hdfgwtr_masks, + &hafgrtr_masks, + &hfgrtr2_masks, + &hfgwtr2_masks, + &hfgitr2_masks, + &hdfgrtr2_masks, + &hdfgwtr2_masks, + }; + int err = 0; + + for (int i = 0; i < ARRAY_SIZE(masks); i++) + err |= check_fgt_masks(masks[i]); + + return ret ?: err; +} + +#define for_each_encoding_in(__x, __s, __e) \ + for (u32 __x = __s; __x <= __e; __x = encoding_next(__x)) + +int __init populate_nv_trap_config(void) +{ + int ret = 0; + + BUILD_BUG_ON(sizeof(union trap_config) != sizeof(void *)); + BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS)); + BUILD_BUG_ON(__NR_FGT_GROUP_IDS__ > BIT(TC_FGT_BITS)); + BUILD_BUG_ON(__NR_FG_FILTER_IDS__ > BIT(TC_FGF_BITS)); + BUILD_BUG_ON(__HCRX_EL2_MASK & __HCRX_EL2_nMASK); + + for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) { + const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i]; + void *prev; + + if (cgt->tc.val & BIT(63)) { + kvm_err("CGT[%d] has MBZ bit set\n", i); + ret = -EINVAL; + } + + for_each_encoding_in(enc, cgt->encoding, cgt->end) { + prev = xa_store(&sr_forward_xa, enc, + xa_mk_value(cgt->tc.val), GFP_KERNEL); + if (prev && !xa_is_err(prev)) { + ret = -EINVAL; + print_nv_trap_error(cgt, "Duplicate CGT", ret); + } + + if (xa_is_err(prev)) { + ret = xa_err(prev); + print_nv_trap_error(cgt, "Failed CGT insertion", ret); + } + } + } + + if (__HCRX_EL2_RES0 != HCRX_EL2_RES0) + kvm_info("Sanitised HCR_EL2_RES0 = %016llx, expecting %016llx\n", + __HCRX_EL2_RES0, HCRX_EL2_RES0); + + kvm_info("nv: %ld coarse grained trap handlers\n", + ARRAY_SIZE(encoding_to_cgt)); + + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + goto check_mcb; + + for (int i = 0; i < ARRAY_SIZE(encoding_to_fgt); i++) { + const struct encoding_to_trap_config *fgt = &encoding_to_fgt[i]; + union trap_config tc; + void *prev; + + if (fgt->tc.fgt >= __NR_FGT_GROUP_IDS__) { + ret = -EINVAL; + print_nv_trap_error(fgt, "Invalid FGT", ret); + } + + for_each_encoding_in(enc, fgt->encoding, fgt->end) { + tc = get_trap_config(enc); + + if (tc.fgt) { + ret = -EINVAL; + print_nv_trap_error(fgt, "Duplicate FGT", ret); + } + + tc.val |= fgt->tc.val; + prev = xa_store(&sr_forward_xa, enc, + xa_mk_value(tc.val), GFP_KERNEL); + + if (xa_is_err(prev)) { + ret = xa_err(prev); + print_nv_trap_error(fgt, "Failed FGT insertion", ret); + } + + if (!aggregate_fgt(tc)) { + ret = -EINVAL; + 
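For a sense of why the populate_nv_trap_config() loop above can hand each entry straight to xa_mk_value(): the whole trap description is squeezed into one machine word with bit 63 kept clear (hence the MBZ check). Below is a standalone toy version of that packing; the field names and widths are invented for the example and do not follow the kernel's actual union trap_config layout.

#include <stdint.h>
#include <stdio.h>

/* Toy equivalent of packing a trap description into a single word. */
union demo_trap_config {
	uint64_t val;
	struct {
		uint64_t cgt:10;	/* coarse-grained trap group */
		uint64_t fgt:5;		/* fine-grained trap register group */
		uint64_t bit:6;		/* bit position in that register */
		uint64_t pol:1;		/* 1: trap when set, 0: trap when clear */
	};
};

int main(void)
{
	union demo_trap_config tc = { .fgt = 2, .bit = 41, .pol = 1 };

	/* The packed word is what would be handed to xa_mk_value(). */
	printf("packed: %#llx\n", (unsigned long long)tc.val);
	printf("fgt=%llu bit=%llu pol=%llu\n",
	       (unsigned long long)tc.fgt,
	       (unsigned long long)tc.bit,
	       (unsigned long long)tc.pol);
	return 0;
}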
print_nv_trap_error(fgt, "FGT bit is reserved", ret); + } + } + } + + for (int i = 0; i < ARRAY_SIZE(non_0x18_fgt); i++) { + if (!aggregate_fgt(non_0x18_fgt[i])) { + ret = -EINVAL; + kvm_err("non_0x18_fgt[%d] is reserved\n", i); + } + } + + ret = check_all_fgt_masks(ret); + + kvm_info("nv: %ld fine grained trap handlers\n", + ARRAY_SIZE(encoding_to_fgt)); + +check_mcb: + for (int id = __MULTIPLE_CONTROL_BITS__; id < __COMPLEX_CONDITIONS__; id++) { + const enum cgt_group_id *cgids; + + cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; + + for (int i = 0; cgids[i] != __RESERVED__; i++) { + if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ && + cgids[i] < __COMPLEX_CONDITIONS__) { + kvm_err("Recursive MCB %d/%d\n", id, cgids[i]); + ret = -EINVAL; + } + } + } + + if (ret) + xa_destroy(&sr_forward_xa); + + return ret; +} + +int __init populate_sysreg_config(const struct sys_reg_desc *sr, + unsigned int idx) +{ + union trap_config tc; + u32 encoding; + void *ret; + + /* + * 0 is a valid value for the index, but not for the storage. + * We'll store (idx+1), so check against an offset'd limit. + */ + if (idx >= (BIT(TC_SRI_BITS) - 1)) { + kvm_err("sysreg %s (%d) out of range\n", sr->name, idx); + return -EINVAL; + } + + encoding = sys_reg(sr->Op0, sr->Op1, sr->CRn, sr->CRm, sr->Op2); + tc = get_trap_config(encoding); + + if (tc.sri) { + kvm_err("sysreg %s (%d) duplicate entry (%d)\n", + sr->name, idx - 1, tc.sri); + return -EINVAL; + } + + tc.sri = idx + 1; + ret = xa_store(&sr_forward_xa, encoding, + xa_mk_value(tc.val), GFP_KERNEL); + + return xa_err(ret); +} + +static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu, + const struct trap_bits *tb) +{ + enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY; + u64 val; + + val = __vcpu_sys_reg(vcpu, tb->index); + if ((val & tb->mask) == tb->value) + b |= tb->behaviour; + + return b; +} + +static enum trap_behaviour __compute_trap_behaviour(struct kvm_vcpu *vcpu, + const enum cgt_group_id id, + enum trap_behaviour b) +{ + switch (id) { + const enum cgt_group_id *cgids; + + case __RESERVED__ ... __MULTIPLE_CONTROL_BITS__ - 1: + if (likely(id != __RESERVED__)) + b |= get_behaviour(vcpu, &coarse_trap_bits[id]); + break; + case __MULTIPLE_CONTROL_BITS__ ... __COMPLEX_CONDITIONS__ - 1: + /* Yes, this is recursive. Don't do anything stupid. */ + cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; + for (int i = 0; cgids[i] != __RESERVED__; i++) + b |= __compute_trap_behaviour(vcpu, cgids[i], b); + break; + default: + if (ARRAY_SIZE(ccc)) + b |= ccc[id - __COMPLEX_CONDITIONS__](vcpu); + break; + } + + return b; +} + +static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu, + const union trap_config tc) +{ + enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY; + + return __compute_trap_behaviour(vcpu, tc.cgt, b); +} + +static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) +{ + struct kvm_sysreg_masks *masks; + + /* Only handle the VNCR-backed regs for now */ + if (sr < __VNCR_START__) + return 0; + + masks = kvm->arch.sysreg_masks; + + return masks->mask[sr - __VNCR_START__].res0; +} + +static bool check_fgt_bit(struct kvm_vcpu *vcpu, enum vcpu_sysreg sr, + const union trap_config tc) +{ + struct kvm *kvm = vcpu->kvm; + u64 val; + + /* + * KVM doesn't know about any FGTs that apply to the host, and hopefully + * that'll remain the case. 
+ */ + if (is_hyp_ctxt(vcpu)) + return false; + + val = __vcpu_sys_reg(vcpu, sr); + + if (tc.pol) + return (val & BIT(tc.bit)); + + /* + * FGTs with negative polarities are an absolute nightmare, as + * we need to evaluate the bit in the light of the feature + * that defines it. WTF were they thinking? + * + * So let's check if the bit has been earmarked as RES0, as + * this indicates an unimplemented feature. + */ + if (val & BIT(tc.bit)) + return false; + + return !(kvm_get_sysreg_res0(kvm, sr) & BIT(tc.bit)); +} + +bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) +{ + enum vcpu_sysreg fgtreg; + union trap_config tc; + enum trap_behaviour b; + bool is_read; + u32 sysreg; + u64 esr; + + esr = kvm_vcpu_get_esr(vcpu); + sysreg = esr_sys64_to_sysreg(esr); + is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + + tc = get_trap_config(sysreg); + + /* + * A value of 0 for the whole entry means that we know nothing + * for this sysreg, and that it cannot be re-injected into the + * nested hypervisor. In this situation, let's cut it short. + */ + if (!tc.val) + goto local; + + /* + * If a sysreg can be trapped using a FGT, first check whether we + * trap for the purpose of forbidding the feature. In that case, + * inject an UNDEF. + */ + if (tc.fgt != __NO_FGT_GROUP__ && + (vcpu->kvm->arch.fgu[tc.fgt] & BIT(tc.bit))) { + kvm_inject_undefined(vcpu); + return true; + } + + /* + * If we're not nesting, immediately return to the caller, with the + * sysreg index, should we have it. + */ + if (!vcpu_has_nv(vcpu)) + goto local; + + /* + * There are a few traps that take effect InHost, but are constrained + * to EL0. Don't bother with computing the trap behaviour if the vCPU + * isn't in EL0. + */ + if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu)) + goto local; + + switch ((enum fgt_group_id)tc.fgt) { + case __NO_FGT_GROUP__: + break; + + case HFGRTR_GROUP: + fgtreg = is_read ? HFGRTR_EL2 : HFGWTR_EL2; + break; + + case HDFGRTR_GROUP: + fgtreg = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2; + break; + + case HAFGRTR_GROUP: + fgtreg = HAFGRTR_EL2; + break; + + case HFGITR_GROUP: + fgtreg = HFGITR_EL2; + switch (tc.fgf) { + u64 tmp; + + case __NO_FGF__: + break; + + case HCRX_FGTnXS: + tmp = __vcpu_sys_reg(vcpu, HCRX_EL2); + if (tmp & HCRX_EL2_FGTnXS) + tc.fgt = __NO_FGT_GROUP__; + } + break; + + case HFGRTR2_GROUP: + fgtreg = is_read ? HFGRTR2_EL2 : HFGWTR2_EL2; + break; + + case HDFGRTR2_GROUP: + fgtreg = is_read ? HDFGRTR2_EL2 : HDFGWTR2_EL2; + break; + + case HFGITR2_GROUP: + fgtreg = HFGITR2_EL2; + break; + + default: + /* Something is really wrong, bail out */ + WARN_ONCE(1, "Bad FGT group (encoding %08x, config %016llx)\n", + sysreg, tc.val); + goto local; + } + + if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, fgtreg, tc)) + goto inject; + + b = compute_trap_behaviour(vcpu, tc); + + if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu)) + goto local; + + if (((b & BEHAVE_FORWARD_READ) && is_read) || + ((b & BEHAVE_FORWARD_WRITE) && !is_read)) + goto inject; + +local: + if (!tc.sri) { + struct sys_reg_params params; + + params = esr_sys64_to_params(esr); + + /* + * Check for the IMPDEF range, as per DDI0487 J.a, + * D18.3.2 Reserved encodings for IMPLEMENTATION + * DEFINED registers. 
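The trickiest part of check_fgt_bit() above is the negative-polarity bits: they trap when *clear*, but only if the bit is actually implemented, which is why the per-VM RES0 mask is consulted. A minimal standalone restatement of that decision, with made-up example values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool fgt_bit_traps(uint64_t reg, uint64_t res0, unsigned int bit, bool pol)
{
	uint64_t mask = 1ULL << bit;

	if (pol)
		return reg & mask;	/* positive polarity: 1 means "trap" */

	if (reg & mask)
		return false;		/* nXX bit set: trap disabled */

	return !(res0 & mask);		/* clear and implemented: trap */
}

int main(void)
{
	/* Bit 3 is an implemented nXX control, bit 7 is RES0 (unimplemented). */
	printf("%d\n", fgt_bit_traps(0x0, 0x80, 3, false));	/* 1: traps */
	printf("%d\n", fgt_bit_traps(0x8, 0x80, 3, false));	/* 0: disabled */
	printf("%d\n", fgt_bit_traps(0x0, 0x80, 7, false));	/* 0: RES0 */
	return 0;
}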
+ */ + if (!(params.Op0 == 3 && (params.CRn & 0b1011) == 0b1011)) + print_sys_reg_msg(¶ms, + "Unsupported guest access at: %lx\n", + *vcpu_pc(vcpu)); + kvm_inject_undefined(vcpu); + return true; + } + + *sr_index = tc.sri - 1; + return false; + +inject: + trace_kvm_forward_sysreg_trap(vcpu, sysreg, is_read); + + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return true; +} + +static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit) +{ + if (is_nested_ctxt(vcpu) && + (__vcpu_sys_reg(vcpu, reg) & control_bit)) { + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return true; + } + return false; +} + +static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, HCR_EL2, control_bit); +} + +bool forward_smc_trap(struct kvm_vcpu *vcpu) +{ + return forward_hcr_traps(vcpu, HCR_TSC); +} + +static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, MDCR_EL2, control_bit); +} + +bool forward_debug_exception(struct kvm_vcpu *vcpu) +{ + return forward_mdcr_traps(vcpu, MDCR_EL2_TDE); +} + +static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) +{ + u64 mode = spsr & PSR_MODE_MASK; + + /* + * Possible causes for an Illegal Exception Return from EL2: + * - trying to return to EL3 + * - trying to return to an illegal M value + * - trying to return to a 32bit EL + * - trying to return to EL1 with HCR_EL2.TGE set + */ + if (mode == PSR_MODE_EL3t || mode == PSR_MODE_EL3h || + mode == 0b00001 || (mode & BIT(1)) || + (spsr & PSR_MODE32_BIT) || + (vcpu_el2_tge_is_set(vcpu) && (mode == PSR_MODE_EL1t || + mode == PSR_MODE_EL1h))) { + /* + * The guest is playing with our nerves. Preserve EL, SP, + * masks, flags from the existing PSTATE, and set IL. + * The HW will then generate an Illegal State Exception + * immediately after ERET. + */ + spsr = *vcpu_cpsr(vcpu); + + spsr &= (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | + PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT | + PSR_MODE_MASK | PSR_MODE32_BIT); + spsr |= PSR_IL_BIT; + } + + return spsr; +} + +void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) +{ + u64 spsr, elr, esr; + + spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); + spsr = kvm_check_illegal_exception_return(vcpu, spsr); + + /* Check for an ERETAx */ + esr = kvm_vcpu_get_esr(vcpu); + if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) { + /* + * Oh no, ERETAx failed to authenticate. + * + * If we have FPACCOMBINE and we don't have a pending + * Illegal Execution State exception (which has priority + * over FPAC), deliver an exception right away. + * + * Otherwise, let the mangled ELR value trickle down the + * ERET handling, and the guest will have a little surprise. 
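kvm_check_illegal_exception_return() above reduces to a sanity check on SPSR_EL2.M. A standalone sketch of that classification follows; the mode constants mirror the architectural PSTATE.M encodings, and unlike the kernel this version simply returns a verdict instead of forcing PSTATE.IL.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MODE_EL1t	0x4
#define MODE_EL1h	0x5
#define MODE_EL3t	0xc
#define MODE_EL3h	0xd
#define MODE32_BIT	(1u << 4)	/* SPSR.M[4]: return to AArch32 */

static bool eret_mode_is_illegal(uint32_t spsr, bool tge_set)
{
	uint32_t mode = spsr & 0xf;

	if (spsr & MODE32_BIT)
		return true;			/* 32bit EL not reachable here */
	if (mode == MODE_EL3t || mode == MODE_EL3h)
		return true;			/* no EL3 in the guest */
	if (mode == 0x1 || (mode & 0x2))
		return true;			/* reserved M encodings */
	if (tge_set && (mode == MODE_EL1t || mode == MODE_EL1h))
		return true;			/* EL1 unreachable with HCR_EL2.TGE=1 */
	return false;
}

int main(void)
{
	printf("EL1h, TGE=0: %s\n", eret_mode_is_illegal(MODE_EL1h, false) ? "illegal" : "ok");
	printf("EL1h, TGE=1: %s\n", eret_mode_is_illegal(MODE_EL1h, true) ? "illegal" : "ok");
	printf("EL3h:        %s\n", eret_mode_is_illegal(MODE_EL3h, false) ? "illegal" : "ok");
	return 0;
}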
+ */ + if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) { + esr &= ESR_ELx_ERET_ISS_ERETA; + esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC); + kvm_inject_nested_sync(vcpu, esr); + return; + } + } + + preempt_disable(); + vcpu_set_flag(vcpu, IN_NESTED_ERET); + kvm_arch_vcpu_put(vcpu); + + if (!esr_iss_is_eretax(esr)) + elr = __vcpu_sys_reg(vcpu, ELR_EL2); + + trace_kvm_nested_eret(vcpu, elr, spsr); + + *vcpu_pc(vcpu) = elr; + *vcpu_cpsr(vcpu) = spsr; + + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + vcpu_clear_flag(vcpu, IN_NESTED_ERET); + preempt_enable(); + + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_nested_transition(vcpu); +} + +static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, + enum exception_type type) +{ + trace_kvm_inject_nested_exception(vcpu, esr_el2, type); + + switch (type) { + case except_type_sync: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC); + vcpu_write_sys_reg(vcpu, esr_el2, ESR_EL2); + break; + case except_type_irq: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ); + break; + case except_type_serror: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR); + break; + default: + WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type); + } +} + +/* + * Emulate taking an exception to EL2. + * See ARM ARM J8.1.2 AArch64.TakeException() + */ +static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, + enum exception_type type) +{ + u64 pstate, mode; + bool direct_inject; + + if (!vcpu_has_nv(vcpu)) { + kvm_err("Unexpected call to %s for the non-nesting configuration\n", + __func__); + return -EINVAL; + } + + /* + * As for ERET, we can avoid doing too much on the injection path by + * checking that we either took the exception from a VHE host + * userspace or from vEL2. In these cases, there is no change in + * translation regime (or anything else), so let's do as little as + * possible. + */ + pstate = *vcpu_cpsr(vcpu); + mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT); + + direct_inject = (mode == PSR_MODE_EL0t && + vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)); + direct_inject |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t); + + if (direct_inject) { + kvm_inject_el2_exception(vcpu, esr_el2, type); + return 1; + } + + preempt_disable(); + + /* + * We may have an exception or PC update in the EL0/EL1 context. + * Commit it before entering EL2. + */ + __kvm_adjust_pc(vcpu); + + kvm_arch_vcpu_put(vcpu); + + kvm_inject_el2_exception(vcpu, esr_el2, type); + + /* + * A hard requirement is that a switch between EL1 and EL2 + * contexts has to happen between a put/load, so that we can + * pick the correct timer and interrupt configuration, among + * other things. + * + * Make sure the exception actually took place before we load + * the new context. + */ + __kvm_adjust_pc(vcpu); + + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); + + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_nested_transition(vcpu); + + return 1; +} + +int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2) +{ + return kvm_inject_nested(vcpu, esr_el2, except_type_sync); +} + +int kvm_inject_nested_irq(struct kvm_vcpu *vcpu) +{ + /* + * Do not inject an irq if the: + * - Current exception level is EL2, and + * - virtual HCR_EL2.TGE == 0 + * - virtual HCR_EL2.IMO == 0 + * + * See Table D1-17 "Physical interrupt target and masking when EL3 is + * not implemented and EL2 is implemented" in ARM DDI 0487C.a. 
+ */ + + if (vcpu_is_el2(vcpu) && !vcpu_el2_tge_is_set(vcpu) && + !(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_IMO)) + return 1; + + /* esr_el2 value doesn't matter for exits due to irqs. */ + return kvm_inject_nested(vcpu, 0, except_type_irq); +} + +int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr) +{ + u64 esr = FIELD_PREP(ESR_ELx_EC_MASK, + iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW); + esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL; + + vcpu_write_sys_reg(vcpu, addr, FAR_EL2); + + if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE) + return kvm_inject_nested(vcpu, esr, except_type_serror); + + return kvm_inject_nested_sync(vcpu, esr); +} + +int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr) +{ + /* + * Hardware sets up the EC field when propagating ESR as a result of + * vSError injection. Manually populate EC for an emulated SError + * exception. + */ + esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR); + return kvm_inject_nested(vcpu, esr, except_type_serror); +} diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c new file mode 100644 index 000000000000..15e17aca1dec --- /dev/null +++ b/arch/arm64/kvm/fpsimd.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arch/arm64/kvm/fpsimd.c: Guest/host FPSIMD context coordination helpers + * + * Copyright 2018 Arm Limited + * Author: Dave Martin <Dave.Martin@arm.com> + */ +#include <linux/irqflags.h> +#include <linux/sched.h> +#include <linux/kvm_host.h> +#include <asm/fpsimd.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/sysreg.h> + +/* + * Prepare vcpu for saving the host's FPSIMD state and loading the guest's. + * The actual loading is done by the FPSIMD access trap taken to hyp. + * + * Here, we just set the correct metadata to indicate that the FPSIMD + * state in the cpu regs (if any) belongs to current on the host. + */ +void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) +{ + BUG_ON(!current->mm); + + if (!system_supports_fpsimd()) + return; + + /* + * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such + * that the host kernel is responsible for restoring this state upon + * return to userspace, and the hyp code doesn't need to save anything. + * + * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures + * that PSTATE.{SM,ZA} == {0,0}. + */ + fpsimd_save_and_flush_cpu_state(); + *host_data_ptr(fp_owner) = FP_STATE_FREE; + + WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR)); +} + +/* + * Called just before entering the guest once we are no longer preemptible + * and interrupts are disabled. If we have managed to run anything using + * FP while we were preemptible (such as off the back of an interrupt), + * then neither the host nor the guest own the FP hardware (and it was the + * responsibility of the code that used FP to save the existing state). + */ +void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) +{ + if (test_thread_flag(TIF_FOREIGN_FPSTATE)) + *host_data_ptr(fp_owner) = FP_STATE_FREE; +} + +/* + * Called just after exiting the guest. If the guest FPSIMD state + * was loaded, update the host's context tracking data mark the CPU + * FPSIMD regs as dirty and belonging to vcpu so that they will be + * written back if the kernel clobbers them due to kernel-mode NEON + * before re-entry into the guest. 
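The FPSIMD hooks in this file are easier to follow as a small ownership state machine: loading a vCPU gives up any host state, the first FP trap from the guest hands the register file to the guest, and putting the vCPU saves it back if the guest still owns it. The following is a deliberately simplified standalone model, not the kernel's host_data_ptr() bookkeeping:

#include <stdio.h>

enum demo_fp_owner { FP_FREE, FP_GUEST_OWNED };

static enum demo_fp_owner fp_owner = FP_FREE;

/* kvm_arch_vcpu_load_fp(): host state is saved/flushed, nobody owns the regs. */
static void demo_vcpu_load_fp(void)
{
	fp_owner = FP_FREE;
}

/* First FPSIMD trap from the guest: the guest now owns the register file. */
static void demo_fp_trap(void)
{
	fp_owner = FP_GUEST_OWNED;
}

/* kvm_arch_vcpu_put_fp(): write the guest state back if it is still live. */
static void demo_vcpu_put_fp(void)
{
	if (fp_owner == FP_GUEST_OWNED)
		printf("saving guest FPSIMD/SVE state into the vCPU\n");
	fp_owner = FP_FREE;
}

int main(void)
{
	demo_vcpu_load_fp();
	demo_fp_trap();
	demo_vcpu_put_fp();
	return 0;
}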
+ */ +void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) +{ + struct cpu_fp_state fp_state; + + WARN_ON_ONCE(!irqs_disabled()); + + if (guest_owns_fp_regs()) { + /* + * Currently we do not support SME guests so SVCR is + * always 0 and we just need a variable to point to. + */ + fp_state.st = &vcpu->arch.ctxt.fp_regs; + fp_state.sve_state = vcpu->arch.sve_state; + fp_state.sve_vl = vcpu->arch.sve_max_vl; + fp_state.sme_state = NULL; + fp_state.svcr = __ctxt_sys_reg(&vcpu->arch.ctxt, SVCR); + fp_state.fpmr = __ctxt_sys_reg(&vcpu->arch.ctxt, FPMR); + fp_state.fp_type = &vcpu->arch.fp_type; + + if (vcpu_has_sve(vcpu)) + fp_state.to_save = FP_STATE_SVE; + else + fp_state.to_save = FP_STATE_FPSIMD; + + fpsimd_bind_state_to_cpu(&fp_state); + + clear_thread_flag(TIF_FOREIGN_FPSTATE); + } +} + +/* + * Write back the vcpu FPSIMD regs if they are dirty, and invalidate the + * cpu FPSIMD regs so that they can't be spuriously reused if this vcpu + * disappears and another task or vcpu appears that recycles the same + * struct fpsimd_state. + */ +void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + local_irq_save(flags); + + if (guest_owns_fp_regs()) { + /* + * Flush (save and invalidate) the fpsimd/sve state so that if + * the host tries to use fpsimd/sve, it's not using stale data + * from the guest. + * + * Flushing the state sets the TIF_FOREIGN_FPSTATE bit for the + * context unconditionally, in both nVHE and VHE. This allows + * the kernel to restore the fpsimd/sve state, including ZCR_EL1 + * when needed. + */ + fpsimd_save_and_flush_cpu_state(); + } + + local_irq_restore(flags); +} diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2c3ff67a8ecb..1c87699fd886 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> @@ -5,41 +6,66 @@ * Derived from arch/arm/kvm/guest.c: * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ +#include <linux/bits.h> #include <linux/errno.h> #include <linux/err.h> +#include <linux/nospec.h> #include <linux/kvm_host.h> #include <linux/module.h> +#include <linux/stddef.h> +#include <linux/string.h> #include <linux/vmalloc.h> #include <linux/fs.h> +#include <kvm/arm_hypercalls.h> #include <asm/cputype.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> +#include <asm/fpsimd.h> #include <asm/kvm.h> -#include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> -#include <asm/kvm_coproc.h> +#include <asm/kvm_nested.h> +#include <asm/sigcontext.h> + +#include "trace.h" + +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS() +}; + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, hvc_exit_stat), + STATS_DESC_COUNTER(VCPU, wfe_exit_stat), + STATS_DESC_COUNTER(VCPU, wfi_exit_stat), + STATS_DESC_COUNTER(VCPU, mmio_exit_user), + STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), + STATS_DESC_COUNTER(VCPU, signal_exits), + STATS_DESC_COUNTER(VCPU, exits) +}; -struct kvm_stats_debugfs_item debugfs_entries[] = { - { NULL } +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), }; -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +static bool core_reg_offset_is_vreg(u64 off) { - vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; - return 0; + return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) && + off < KVM_REG_ARM_CORE_REG(fp_regs.fpsr); } static u64 core_reg_offset_from_id(u64 id) @@ -47,6 +73,116 @@ static u64 core_reg_offset_from_id(u64 id) return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); } +static int core_reg_size_from_offset(const struct kvm_vcpu *vcpu, u64 off) +{ + int size; + + switch (off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... + KVM_REG_ARM_CORE_REG(regs.regs[30]): + case KVM_REG_ARM_CORE_REG(regs.sp): + case KVM_REG_ARM_CORE_REG(regs.pc): + case KVM_REG_ARM_CORE_REG(regs.pstate): + case KVM_REG_ARM_CORE_REG(sp_el1): + case KVM_REG_ARM_CORE_REG(elr_el1): + case KVM_REG_ARM_CORE_REG(spsr[0]) ... + KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): + size = sizeof(__u64); + break; + + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... 
+ KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + size = sizeof(__uint128_t); + break; + + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + size = sizeof(__u32); + break; + + default: + return -EINVAL; + } + + if (!IS_ALIGNED(off, size / sizeof(__u32))) + return -EINVAL; + + /* + * The KVM_REG_ARM64_SVE regs must be used instead of + * KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on + * SVE-enabled vcpus: + */ + if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(off)) + return -EINVAL; + + return size; +} + +static void *core_reg_addr(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + u64 off = core_reg_offset_from_id(reg->id); + int size = core_reg_size_from_offset(vcpu, off); + + if (size < 0) + return NULL; + + if (KVM_REG_SIZE(reg->id) != size) + return NULL; + + switch (off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... + KVM_REG_ARM_CORE_REG(regs.regs[30]): + off -= KVM_REG_ARM_CORE_REG(regs.regs[0]); + off /= 2; + return &vcpu->arch.ctxt.regs.regs[off]; + + case KVM_REG_ARM_CORE_REG(regs.sp): + return &vcpu->arch.ctxt.regs.sp; + + case KVM_REG_ARM_CORE_REG(regs.pc): + return &vcpu->arch.ctxt.regs.pc; + + case KVM_REG_ARM_CORE_REG(regs.pstate): + return &vcpu->arch.ctxt.regs.pstate; + + case KVM_REG_ARM_CORE_REG(sp_el1): + return __ctxt_sys_reg(&vcpu->arch.ctxt, SP_EL1); + + case KVM_REG_ARM_CORE_REG(elr_el1): + return __ctxt_sys_reg(&vcpu->arch.ctxt, ELR_EL1); + + case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_EL1]): + return __ctxt_sys_reg(&vcpu->arch.ctxt, SPSR_EL1); + + case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_ABT]): + return &vcpu->arch.ctxt.spsr_abt; + + case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_UND]): + return &vcpu->arch.ctxt.spsr_und; + + case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_IRQ]): + return &vcpu->arch.ctxt.spsr_irq; + + case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_FIQ]): + return &vcpu->arch.ctxt.spsr_fiq; + + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... + KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + off -= KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]); + off /= 4; + return &vcpu->arch.ctxt.fp_regs.vregs[off]; + + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + return &vcpu->arch.ctxt.fp_regs.fpsr; + + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + return &vcpu->arch.ctxt.fp_regs.fpcr; + + default: + return NULL; + } +} + static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* @@ -56,8 +192,8 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) * off the index in the "array". */ __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr; - struct kvm_regs *regs = vcpu_gp_regs(vcpu); - int nr_regs = sizeof(*regs) / sizeof(__u32); + int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32); + void *addr; u32 off; /* Our ID is an index into the kvm_regs struct. 
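From userspace, the core-register accessors above are driven through KVM_GET_ONE_REG/KVM_SET_ONE_REG, with the register ID encoding the 32-bit-word offset inside struct kvm_regs. A minimal sketch that reads the guest PC (assumes an arm64 vCPU fd obtained from KVM_CREATE_VCPU; no error handling beyond the ioctl return):

#include <linux/kvm.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int read_guest_pc(int vcpu_fd, uint64_t *pc)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE |
			KVM_REG_ARM_CORE_REG(regs.pc),
		.addr = (uint64_t)(unsigned long)pc,
	};

	/* Returns 0 on success; *pc then holds the guest's program counter. */
	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}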
*/ @@ -66,7 +202,11 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; - if (copy_to_user(uaddr, ((u32 *)regs) + off, KVM_REG_SIZE(reg->id))) + addr = core_reg_addr(vcpu, reg); + if (!addr) + return -EINVAL; + + if (copy_to_user(uaddr, addr, KVM_REG_SIZE(reg->id))) return -EFAULT; return 0; @@ -75,10 +215,9 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr; - struct kvm_regs *regs = vcpu_gp_regs(vcpu); - int nr_regs = sizeof(*regs) / sizeof(__u32); + int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32); __uint128_t tmp; - void *valp = &tmp; + void *valp = &tmp, *addr; u64 off; int err = 0; @@ -88,6 +227,10 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; + addr = core_reg_addr(vcpu, reg); + if (!addr) + return -EINVAL; + if (KVM_REG_SIZE(reg->id) > sizeof(tmp)) return -EINVAL; @@ -97,17 +240,31 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) } if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) { - u32 mode = (*(u32 *)valp) & COMPAT_PSR_MODE_MASK; + u64 mode = (*(u64 *)valp) & PSR_AA32_MODE_MASK; switch (mode) { - case COMPAT_PSR_MODE_USR: - case COMPAT_PSR_MODE_FIQ: - case COMPAT_PSR_MODE_IRQ: - case COMPAT_PSR_MODE_SVC: - case COMPAT_PSR_MODE_ABT: - case COMPAT_PSR_MODE_UND: + case PSR_AA32_MODE_USR: + if (!kvm_supports_32bit_el0()) + return -EINVAL; + break; + case PSR_AA32_MODE_FIQ: + case PSR_AA32_MODE_IRQ: + case PSR_AA32_MODE_SVC: + case PSR_AA32_MODE_ABT: + case PSR_AA32_MODE_UND: + case PSR_AA32_MODE_SYS: + if (!vcpu_el1_is_32bit(vcpu)) + return -EINVAL; + break; + case PSR_MODE_EL2h: + case PSR_MODE_EL2t: + if (!vcpu_has_nv(vcpu)) + return -EINVAL; + fallthrough; case PSR_MODE_EL0t: case PSR_MODE_EL1t: case PSR_MODE_EL1h: + if (vcpu_el1_is_32bit(vcpu)) + return -EINVAL; break; default: err = -EINVAL; @@ -115,11 +272,267 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) } } - memcpy((u32 *)regs + off, valp, KVM_REG_SIZE(reg->id)); + memcpy(addr, valp, KVM_REG_SIZE(reg->id)); + + if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { + int i, nr_reg; + + switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) { + /* + * Either we are dealing with user mode, and only the + * first 15 registers (+ PC) must be narrowed to 32bit. + * AArch32 r0-r14 conveniently map to AArch64 x0-x14. + */ + case PSR_AA32_MODE_USR: + case PSR_AA32_MODE_SYS: + nr_reg = 15; + break; + + /* + * Otherwise, this is a privileged mode, and *all* the + * registers must be narrowed to 32bit. 
+ */ + default: + nr_reg = 31; + break; + } + + for (i = 0; i < nr_reg; i++) + vcpu_set_reg(vcpu, i, (u32)vcpu_get_reg(vcpu, i)); + + *vcpu_pc(vcpu) = (u32)*vcpu_pc(vcpu); + } out: return err; } +#define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64) +#define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64) +#define vq_present(vqs, vq) (!!((vqs)[vq_word(vq)] & vq_mask(vq))) + +static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + unsigned int max_vq, vq; + u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; + + if (!vcpu_has_sve(vcpu)) + return -ENOENT; + + if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl))) + return -EINVAL; + + memset(vqs, 0, sizeof(vqs)); + + max_vq = vcpu_sve_max_vq(vcpu); + for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) + if (sve_vq_available(vq)) + vqs[vq_word(vq)] |= vq_mask(vq); + + if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs))) + return -EFAULT; + + return 0; +} + +static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + unsigned int max_vq, vq; + u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; + + if (!vcpu_has_sve(vcpu)) + return -ENOENT; + + if (kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; /* too late! */ + + if (WARN_ON(vcpu->arch.sve_state)) + return -EINVAL; + + if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs))) + return -EFAULT; + + max_vq = 0; + for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq) + if (vq_present(vqs, vq)) + max_vq = vq; + + if (max_vq > sve_vq_from_vl(kvm_sve_max_vl)) + return -EINVAL; + + /* + * Vector lengths supported by the host can't currently be + * hidden from the guest individually: instead we can only set a + * maximum via ZCR_EL2.LEN. So, make sure the available vector + * lengths match the set requested exactly up to the requested + * maximum: + */ + for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) + if (vq_present(vqs, vq) != sve_vq_available(vq)) + return -EINVAL; + + /* Can't run with no vector lengths at all: */ + if (max_vq < SVE_VQ_MIN) + return -EINVAL; + + /* vcpu->arch.sve_state will be alloc'd by kvm_vcpu_finalize_sve() */ + vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq); + + return 0; +} + +#define SVE_REG_SLICE_SHIFT 0 +#define SVE_REG_SLICE_BITS 5 +#define SVE_REG_ID_SHIFT (SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS) +#define SVE_REG_ID_BITS 5 + +#define SVE_REG_SLICE_MASK \ + GENMASK(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS - 1, \ + SVE_REG_SLICE_SHIFT) +#define SVE_REG_ID_MASK \ + GENMASK(SVE_REG_ID_SHIFT + SVE_REG_ID_BITS - 1, SVE_REG_ID_SHIFT) + +#define SVE_NUM_SLICES (1 << SVE_REG_SLICE_BITS) + +#define KVM_SVE_ZREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_ZREG(0, 0)) +#define KVM_SVE_PREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_PREG(0, 0)) + +/* + * Number of register slices required to cover each whole SVE register. + * NOTE: Only the first slice every exists, for now. 
+ * If you are tempted to modify this, you must also rework sve_reg_to_region() + * to match: + */ +#define vcpu_sve_slices(vcpu) 1 + +/* Bounds of a single SVE register slice within vcpu->arch.sve_state */ +struct sve_state_reg_region { + unsigned int koffset; /* offset into sve_state in kernel memory */ + unsigned int klen; /* length in kernel memory */ + unsigned int upad; /* extra trailing padding in user memory */ +}; + +/* + * Validate SVE register ID and get sanitised bounds for user/kernel SVE + * register copy + */ +static int sve_reg_to_region(struct sve_state_reg_region *region, + struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + /* reg ID ranges for Z- registers */ + const u64 zreg_id_min = KVM_REG_ARM64_SVE_ZREG(0, 0); + const u64 zreg_id_max = KVM_REG_ARM64_SVE_ZREG(SVE_NUM_ZREGS - 1, + SVE_NUM_SLICES - 1); + + /* reg ID ranges for P- registers and FFR (which are contiguous) */ + const u64 preg_id_min = KVM_REG_ARM64_SVE_PREG(0, 0); + const u64 preg_id_max = KVM_REG_ARM64_SVE_FFR(SVE_NUM_SLICES - 1); + + unsigned int vq; + unsigned int reg_num; + + unsigned int reqoffset, reqlen; /* User-requested offset and length */ + unsigned int maxlen; /* Maximum permitted length */ + + size_t sve_state_size; + + const u64 last_preg_id = KVM_REG_ARM64_SVE_PREG(SVE_NUM_PREGS - 1, + SVE_NUM_SLICES - 1); + + /* Verify that the P-regs and FFR really do have contiguous IDs: */ + BUILD_BUG_ON(KVM_REG_ARM64_SVE_FFR(0) != last_preg_id + 1); + + /* Verify that we match the UAPI header: */ + BUILD_BUG_ON(SVE_NUM_SLICES != KVM_ARM64_SVE_MAX_SLICES); + + reg_num = (reg->id & SVE_REG_ID_MASK) >> SVE_REG_ID_SHIFT; + + if (reg->id >= zreg_id_min && reg->id <= zreg_id_max) { + if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) + return -ENOENT; + + vq = vcpu_sve_max_vq(vcpu); + + reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) - + SVE_SIG_REGS_OFFSET; + reqlen = KVM_SVE_ZREG_SIZE; + maxlen = SVE_SIG_ZREG_SIZE(vq); + } else if (reg->id >= preg_id_min && reg->id <= preg_id_max) { + if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) + return -ENOENT; + + vq = vcpu_sve_max_vq(vcpu); + + reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) - + SVE_SIG_REGS_OFFSET; + reqlen = KVM_SVE_PREG_SIZE; + maxlen = SVE_SIG_PREG_SIZE(vq); + } else { + return -EINVAL; + } + + sve_state_size = vcpu_sve_state_size(vcpu); + if (WARN_ON(!sve_state_size)) + return -EINVAL; + + region->koffset = array_index_nospec(reqoffset, sve_state_size); + region->klen = min(maxlen, reqlen); + region->upad = reqlen - region->klen; + + return 0; +} + +static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + int ret; + struct sve_state_reg_region region; + char __user *uptr = (char __user *)reg->addr; + + /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ + if (reg->id == KVM_REG_ARM64_SVE_VLS) + return get_sve_vls(vcpu, reg); + + /* Try to interpret reg ID as an architectural SVE register... 
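The KVM_REG_ARM64_SVE_VLS pseudo-register handled here is how a VMM discovers, and before finalization constrains, the guest's vector lengths. A userspace sketch of the read side (assumes an arm64 vCPU created with the SVE feature enabled; error handling is minimal):

#include <linux/kvm.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void print_sve_vls(int vcpu_fd)
{
	uint64_t vqs[KVM_ARM64_SVE_VLS_WORDS] = { 0 };
	struct kvm_one_reg reg = {
		.id   = KVM_REG_ARM64_SVE_VLS,
		.addr = (uint64_t)(unsigned long)vqs,
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) {
		perror("KVM_GET_ONE_REG(SVE_VLS)");
		return;
	}

	/* Bit (vq - 1) set means a vector of vq * 128 bits is available. */
	for (unsigned int vq = 1; vq <= 64 * KVM_ARM64_SVE_VLS_WORDS; vq++) {
		unsigned int word = (vq - 1) / 64, bit = (vq - 1) % 64;

		if (vqs[word] & (1ULL << bit))
			printf("vector length %u bits supported\n", vq * 128);
	}
}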
*/ + ret = sve_reg_to_region(®ion, vcpu, reg); + if (ret) + return ret; + + if (!kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; + + if (copy_to_user(uptr, vcpu->arch.sve_state + region.koffset, + region.klen) || + clear_user(uptr + region.klen, region.upad)) + return -EFAULT; + + return 0; +} + +static int set_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + int ret; + struct sve_state_reg_region region; + const char __user *uptr = (const char __user *)reg->addr; + + /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ + if (reg->id == KVM_REG_ARM64_SVE_VLS) + return set_sve_vls(vcpu, reg); + + /* Try to interpret reg ID as an architectural SVE register... */ + ret = sve_reg_to_region(®ion, vcpu, reg); + if (ret) + return ret; + + if (!kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; + + if (copy_from_user(vcpu->arch.sve_state + region.koffset, uptr, + region.klen)) + return -EFAULT; + + return 0; +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { return -EINVAL; @@ -130,36 +543,158 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return -EINVAL; } -static unsigned long num_core_regs(void) +static int copy_core_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + unsigned int i; + int n = 0; + + for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { + u64 reg = KVM_REG_ARM64 | KVM_REG_ARM_CORE | i; + int size = core_reg_size_from_offset(vcpu, i); + + if (size < 0) + continue; + + switch (size) { + case sizeof(__u32): + reg |= KVM_REG_SIZE_U32; + break; + + case sizeof(__u64): + reg |= KVM_REG_SIZE_U64; + break; + + case sizeof(__uint128_t): + reg |= KVM_REG_SIZE_U128; + break; + + default: + WARN_ON(1); + continue; + } + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + + n++; + } + + return n; +} + +static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) +{ + return copy_core_reg_indices(vcpu, NULL); +} + +static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) { - return sizeof(struct kvm_regs) / sizeof(__u32); + const unsigned int slices = vcpu_sve_slices(vcpu); + + if (!vcpu_has_sve(vcpu)) + return 0; + + /* Policed by KVM_GET_REG_LIST: */ + WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); + + return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */) + + 1; /* KVM_REG_ARM64_SVE_VLS */ +} + +static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + const unsigned int slices = vcpu_sve_slices(vcpu); + u64 reg; + unsigned int i, n; + int num_regs = 0; + + if (!vcpu_has_sve(vcpu)) + return 0; + + /* Policed by KVM_GET_REG_LIST: */ + WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); + + /* + * Enumerate this first, so that userspace can save/restore in + * the order reported by KVM_GET_REG_LIST: + */ + reg = KVM_REG_ARM64_SVE_VLS; + if (put_user(reg, uindices++)) + return -EFAULT; + ++num_regs; + + for (i = 0; i < slices; i++) { + for (n = 0; n < SVE_NUM_ZREGS; n++) { + reg = KVM_REG_ARM64_SVE_ZREG(n, i); + if (put_user(reg, uindices++)) + return -EFAULT; + num_regs++; + } + + for (n = 0; n < SVE_NUM_PREGS; n++) { + reg = KVM_REG_ARM64_SVE_PREG(n, i); + if (put_user(reg, uindices++)) + return -EFAULT; + num_regs++; + } + + reg = KVM_REG_ARM64_SVE_FFR(i); + if (put_user(reg, uindices++)) + return -EFAULT; + num_regs++; + } + + return num_regs; } /** * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG + * @vcpu: the vCPU pointer * * This is for all registers. 
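kvm_arm_num_regs() and kvm_arm_copy_reg_indices() back the KVM_GET_REG_LIST ioctl. Userspace typically probes with n = 0, expects -E2BIG together with the real count, and then retries with a large enough array; a sketch:

#include <linux/kvm.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static void dump_reg_ids(int vcpu_fd)
{
	struct kvm_reg_list probe = { .n = 0 };
	struct kvm_reg_list *list;

	/* Expected to fail with E2BIG and write the register count into n. */
	ioctl(vcpu_fd, KVM_GET_REG_LIST, &probe);

	list = calloc(1, sizeof(*list) + probe.n * sizeof(uint64_t));
	if (!list)
		return;
	list->n = probe.n;

	if (ioctl(vcpu_fd, KVM_GET_REG_LIST, list) == 0) {
		/* Core, SVE, firmware and system register IDs, in KVM's order. */
		for (uint64_t i = 0; i < list->n; i++)
			printf("reg id %#llx\n", (unsigned long long)list->reg[i]);
	}
	free(list);
}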
*/ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) { - return num_core_regs() + kvm_arm_num_sys_reg_descs(vcpu); + unsigned long res = 0; + + res += num_core_regs(vcpu); + res += num_sve_regs(vcpu); + res += kvm_arm_num_sys_reg_descs(vcpu); + res += kvm_arm_get_fw_num_regs(vcpu); + + return res; } /** * kvm_arm_copy_reg_indices - get indices of all registers. + * @vcpu: the vCPU pointer + * @uindices: register list to copy * - * We do core registers right here, then we apppend system regs. + * We do core registers right here, then we append system regs. */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - unsigned int i; - const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE; + int ret; - for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { - if (put_user(core_reg | i, uindices)) - return -EFAULT; - uindices++; - } + ret = copy_core_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = copy_sve_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += kvm_arm_get_fw_num_regs(vcpu); return kvm_arm_copy_sys_reg_indices(vcpu, uindices); } @@ -170,9 +705,13 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; - /* Register group 16 means we want a core register. */ - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) - return get_core_reg(vcpu, reg); + switch (reg->id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: return get_core_reg(vcpu, reg); + case KVM_REG_ARM_FW: + case KVM_REG_ARM_FW_FEAT_BMAP: + return kvm_arm_get_fw_reg(vcpu, reg); + case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg); + } return kvm_arm_sys_reg_get_reg(vcpu, reg); } @@ -183,9 +722,13 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; - /* Register group 16 means we set a core register. 
*/ - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) - return set_core_reg(vcpu, reg); + switch (reg->id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg); + case KVM_REG_ARM_FW: + case KVM_REG_ARM_FW_FEAT_BMAP: + return kvm_arm_set_fw_reg(vcpu, reg); + case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg); + } return kvm_arm_sys_reg_set_reg(vcpu, reg); } @@ -202,50 +745,111 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return -EINVAL; } -int __attribute_const__ kvm_target_cpu(void) +int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) { - unsigned long implementor = read_cpuid_implementor(); - unsigned long part_number = read_cpuid_part_number(); + events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); + events->exception.serror_pending = (vcpu->arch.hcr_el2 & HCR_VSE) || + vcpu_get_flag(vcpu, NESTED_SERROR_PENDING); - if (implementor != ARM_CPU_IMP_ARM) - return -EINVAL; + if (events->exception.serror_pending && events->exception.serror_has_esr) + events->exception.serror_esr = vcpu_get_vsesr(vcpu); - switch (part_number) { - case ARM_CPU_PART_AEM_V8: - return KVM_ARM_TARGET_AEM_V8; - case ARM_CPU_PART_FOUNDATION: - return KVM_ARM_TARGET_FOUNDATION_V8; - case ARM_CPU_PART_CORTEX_A57: - /* Currently handled by the generic backend */ - return KVM_ARM_TARGET_CORTEX_A57; - default: - return -EINVAL; - } + /* + * We never return a pending ext_dabt here because we deliver it to + * the virtual CPU directly when setting the event and it's no longer + * 'pending' at this point. + */ + + return 0; } -int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, - const struct kvm_vcpu_init *init) +static void commit_pending_events(struct kvm_vcpu *vcpu) { - unsigned int i; - int phys_target = kvm_target_cpu(); + if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) + return; + + /* + * Reset the MMIO emulation state to avoid stepping PC after emulating + * the exception entry. + */ + vcpu->mmio_needed = false; + kvm_call_hyp(__kvm_adjust_pc, vcpu); +} - if (init->target != phys_target) +int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + bool serror_pending = events->exception.serror_pending; + bool has_esr = events->exception.serror_has_esr; + bool ext_dabt_pending = events->exception.ext_dabt_pending; + u64 esr = events->exception.serror_esr; + int ret = 0; + + /* + * Immediately commit the pending SEA to the vCPU's architectural + * state which is necessary since we do not return a pending SEA + * to userspace via KVM_GET_VCPU_EVENTS. + */ + if (ext_dabt_pending) { + ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + commit_pending_events(vcpu); + } + + if (ret < 0) + return ret; + + if (!serror_pending) + return 0; + + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && has_esr) return -EINVAL; - vcpu->arch.target = phys_target; - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + if (has_esr && (esr & ~ESR_ELx_ISS_MASK)) + return -EINVAL; + + if (has_esr) + ret = kvm_inject_serror_esr(vcpu, esr); + else + ret = kvm_inject_serror(vcpu); - /* -ENOENT for unknown features, -EINVAL for invalid combinations. 
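The __kvm_arm_vcpu_get_events()/__kvm_arm_vcpu_set_events() pair sits behind the KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS ioctls. As an example, a VMM can queue a virtual SError like this (a sketch; supplying serror_esr additionally requires the RAS extension, as checked above):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int inject_serror(int vcpu_fd)
{
	struct kvm_vcpu_events events;

	memset(&events, 0, sizeof(events));
	events.exception.serror_pending = 1;

	/* Returns 0 on success; the SError becomes pending for the vCPU. */
	return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}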
*/ - for (i = 0; i < sizeof(init->features) * 8; i++) { - if (init->features[i / 32] & (1 << (i % 32))) { - if (i >= KVM_VCPU_MAX_FEATURES) - return -ENOENT; - set_bit(i, vcpu->arch.features); + /* + * We could've decided that the SError is due for immediate software + * injection; commit the exception in case userspace decides it wants + * to inject more exceptions for some strange reason. + */ + commit_pending_events(vcpu); + return (ret < 0) ? ret : 0; +} + +u32 __attribute_const__ kvm_target_cpu(void) +{ + unsigned long implementor = read_cpuid_implementor(); + unsigned long part_number = read_cpuid_part_number(); + + switch (implementor) { + case ARM_CPU_IMP_ARM: + switch (part_number) { + case ARM_CPU_PART_AEM_V8: + return KVM_ARM_TARGET_AEM_V8; + case ARM_CPU_PART_FOUNDATION: + return KVM_ARM_TARGET_FOUNDATION_V8; + case ARM_CPU_PART_CORTEX_A53: + return KVM_ARM_TARGET_CORTEX_A53; + case ARM_CPU_PART_CORTEX_A57: + return KVM_ARM_TARGET_CORTEX_A57; + } + break; + case ARM_CPU_IMP_APM: + switch (part_number) { + case APM_CPU_PART_XGENE: + return KVM_ARM_TARGET_XGENE_POTENZA; } + break; } - /* Now we know what it is, we can reset it. */ - return kvm_reset_vcpu(vcpu); + /* Return a default generic target */ + return KVM_ARM_TARGET_GENERIC_V8; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) @@ -263,3 +867,215 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { return -EINVAL; } + +/** + * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging + * @vcpu: the vCPU pointer + * @dbg: the ioctl data buffer + * + * This sets up and enables the VM for guest debugging. Userspace + * passes in a control flag to enable different debug types and + * potentially other architecture specific information in the rest of + * the structure. 
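kvm_arch_vcpu_ioctl_set_guest_debug() is reached from userspace via KVM_SET_GUEST_DEBUG; enabling single-step, for instance, only needs the control flags. A sketch:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int enable_single_step(int vcpu_fd)
{
	struct kvm_guest_debug dbg;

	memset(&dbg, 0, sizeof(dbg));
	dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;

	/* Returns 0 on success; clearing control later disables debugging. */
	return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}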
+ */ +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + trace_kvm_set_guest_debug(vcpu, dbg->control); + + if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) + return -EINVAL; + + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { + vcpu->guest_debug = 0; + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); + return 0; + } + + vcpu->guest_debug = dbg->control; + + /* Hardware assisted Break and Watch points */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + vcpu->arch.external_debug_state = dbg->arch; + + return 0; +} + +int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_ARM_VCPU_PMU_V3_CTRL: + mutex_lock(&vcpu->kvm->arch.config_lock); + ret = kvm_arm_pmu_v3_set_attr(vcpu, attr); + mutex_unlock(&vcpu->kvm->arch.config_lock); + break; + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_set_attr(vcpu, attr); + break; + case KVM_ARM_VCPU_PVTIME_CTRL: + ret = kvm_arm_pvtime_set_attr(vcpu, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_ARM_VCPU_PMU_V3_CTRL: + ret = kvm_arm_pmu_v3_get_attr(vcpu, attr); + break; + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_get_attr(vcpu, attr); + break; + case KVM_ARM_VCPU_PVTIME_CTRL: + ret = kvm_arm_pvtime_get_attr(vcpu, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_ARM_VCPU_PMU_V3_CTRL: + ret = kvm_arm_pmu_v3_has_attr(vcpu, attr); + break; + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_has_attr(vcpu, attr); + break; + case KVM_ARM_VCPU_PVTIME_CTRL: + ret = kvm_arm_pvtime_has_attr(vcpu, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, + struct kvm_arm_copy_mte_tags *copy_tags) +{ + gpa_t guest_ipa = copy_tags->guest_ipa; + size_t length = copy_tags->length; + void __user *tags = copy_tags->addr; + gpa_t gfn; + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); + int ret = 0; + + if (!kvm_has_mte(kvm)) + return -EINVAL; + + if (copy_tags->reserved[0] || copy_tags->reserved[1]) + return -EINVAL; + + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) + return -EINVAL; + + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) + return -EINVAL; + + /* Lengths above INT_MAX cannot be represented in the return value */ + if (length > INT_MAX) + return -EINVAL; + + gfn = gpa_to_gfn(guest_ipa); + + mutex_lock(&kvm->slots_lock); + + if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) { + ret = -EBUSY; + goto out; + } + + while (length > 0) { + struct page *page = __gfn_to_page(kvm, gfn, write); + void *maddr; + unsigned long num_tags; + struct folio *folio; + + if (!page) { + ret = -EFAULT; + goto out; + } + + if (!pfn_to_online_page(page_to_pfn(page))) { + /* Reject ZONE_DEVICE memory */ + kvm_release_page_unused(page); + ret = -EFAULT; + goto out; + } + folio = page_folio(page); + maddr = page_address(page); + + if (!write) { + if ((folio_test_hugetlb(folio) && + folio_test_hugetlb_mte_tagged(folio)) || + page_mte_tagged(page)) + num_tags = mte_copy_tags_to_user(tags, maddr, + MTE_GRANULES_PER_PAGE); + else + /* No tags in memory, so write zeros */ + num_tags = MTE_GRANULES_PER_PAGE - + clear_user(tags, 
MTE_GRANULES_PER_PAGE); + kvm_release_page_clean(page); + } else { + /* + * Only locking to serialise with a concurrent + * __set_ptes() in the VMM but still overriding the + * tags, hence ignoring the return value. + */ + if (folio_test_hugetlb(folio)) + folio_try_hugetlb_mte_tagging(folio); + else + try_page_mte_tagging(page); + num_tags = mte_copy_tags_from_user(maddr, tags, + MTE_GRANULES_PER_PAGE); + + /* uaccess failed, don't leave stale tags */ + if (num_tags != MTE_GRANULES_PER_PAGE) + mte_clear_page_tags(maddr); + if (folio_test_hugetlb(folio)) + folio_set_hugetlb_mte_tagged(folio); + else + set_page_mte_tagged(page); + + kvm_release_page_dirty(page); + } + + if (num_tags != MTE_GRANULES_PER_PAGE) { + ret = -EFAULT; + goto out; + } + + gfn++; + tags += num_tags; + length -= PAGE_SIZE; + } + +out: + mutex_unlock(&kvm->slots_lock); + /* If some data has been copied report the number of bytes copied */ + if (length != copy_tags->length) + return copy_tags->length - length; + return ret; +} diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 9beaca033437..cc7d5d1709cb 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> @@ -5,116 +6,478 @@ * Derived from arch/arm/kvm/handle_exit.c: * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/ubsan.h> + +#include <asm/esr.h> +#include <asm/exception.h> +#include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> -#include <asm/kvm_coproc.h> #include <asm/kvm_mmu.h> -#include <asm/kvm_psci.h> +#include <asm/kvm_nested.h> +#include <asm/debug-monitors.h> +#include <asm/stacktrace/nvhe.h> +#include <asm/traps.h> + +#include <kvm/arm_hypercalls.h> + +#define CREATE_TRACE_POINTS +#include "trace_handle_exit.h" -typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); +typedef int (*exit_handle_fn)(struct kvm_vcpu *); -static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) +static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr) { - if (kvm_psci_call(vcpu)) + if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr)) + kvm_inject_serror(vcpu); +} + +static int handle_hvc(struct kvm_vcpu *vcpu) +{ + trace_kvm_hvc_arm64(*vcpu_pc(vcpu), vcpu_get_reg(vcpu, 0), + kvm_vcpu_hvc_get_imm(vcpu)); + vcpu->stat.hvc_exit_stat++; + + /* Forward hvc instructions to the virtual EL2 if the guest has EL2. 
*/ + if (vcpu_has_nv(vcpu)) { + if (vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_HCD) + kvm_inject_undefined(vcpu); + else + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return 1; + } - kvm_inject_undefined(vcpu); - return 1; + return kvm_smccc_call_handler(vcpu); } -static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_smc(struct kvm_vcpu *vcpu) { - if (kvm_psci_call(vcpu)) + /* + * Forward this trapped smc instruction to the virtual EL2 if + * the guest has asked for it. + */ + if (forward_smc_trap(vcpu)) + return 1; + + /* + * "If an SMC instruction executed at Non-secure EL1 is + * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a + * Trap exception, not a Secure Monitor Call exception [...]" + * + * We need to advance the PC after the trap, as it would + * otherwise return to the same address. Furthermore, pre-incrementing + * the PC before potentially exiting to userspace maintains the same + * abstraction for both SMCs and HVCs. + */ + kvm_incr_pc(vcpu); + + /* + * SMCs with a nonzero immediate are reserved according to DEN0028E 2.9 + * "SMC and HVC immediate value". + */ + if (kvm_vcpu_hvc_get_imm(vcpu)) { + vcpu_set_reg(vcpu, 0, ~0UL); return 1; + } + /* + * If imm is zero then it is likely an SMCCC call. + * + * Note that on ARMv8.3, even if EL3 is not implemented, SMC executed + * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than + * being treated as UNDEFINED. + */ + return kvm_smccc_call_handler(vcpu); +} + +/* + * This handles the cases where the system does not support FP/ASIMD or when + * we are running nested virtualization and the guest hypervisor is trapping + * FP/ASIMD accesses by its own guest. + * + * All other handling of guest vs. host FP/ASIMD register state is handled in + * fixup_guest_exit(). + */ +static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu) +{ + if (guest_hyp_fpsimd_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + /* This is the case when the system doesn't support FP/ASIMD. */ kvm_inject_undefined(vcpu); return 1; } /** - * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest + * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event + * instruction executed by a guest + * * @vcpu: the vcpu pointer * - * Simply call kvm_vcpu_block(), which will halt execution of + * WFE[T]: Yield the CPU and come back to this vcpu when the scheduler + * decides to. + * WFI: Simply call kvm_vcpu_halt(), which will halt execution of * world-switches and schedule other host processes until there is an * incoming IRQ or FIQ to the VM. + * WFIT: Same as WFI, with a timed wakeup implemented as a background timer + * + * WF{I,E}T can immediately return if the deadline has already expired.
*/ -static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_wfx(struct kvm_vcpu *vcpu) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE); + + if (guest_hyp_wfx_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + if (is_wfe) { + trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); + vcpu->stat.wfe_exit_stat++; + } else { + trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); + vcpu->stat.wfi_exit_stat++; + } + + if (esr & ESR_ELx_WFx_ISS_WFxT) { + if (esr & ESR_ELx_WFx_ISS_RV) { + u64 val, now; + + now = kvm_phys_timer_read(); + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + now -= timer_get_offset(vcpu_hvtimer(vcpu)); + else + now -= timer_get_offset(vcpu_vtimer(vcpu)); + + val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + + if (now >= val) + goto out; + } else { + /* Treat WFxT as WFx if RN is invalid */ + esr &= ~ESR_ELx_WFx_ISS_WFxT; + } + } + + if (esr & ESR_ELx_WFx_ISS_WFE) { + kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu)); + } else { + if (esr & ESR_ELx_WFx_ISS_WFxT) + vcpu_set_flag(vcpu, IN_WFIT); + + kvm_vcpu_wfi(vcpu); + } +out: + kvm_incr_pc(vcpu); + + return 1; +} + +/** + * kvm_handle_guest_debug - handle a debug exception instruction + * + * @vcpu: the vcpu pointer + * + * We route all debug exceptions through the same handler. If both the + * guest and host are using the same debug facilities it will be up to + * userspace to re-inject the correct exception for guest delivery. + * + * @return: 0 (while setting vcpu->run->exit_reason) + */ +static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + u64 esr = kvm_vcpu_get_esr(vcpu); + + if (!vcpu->guest_debug && forward_debug_exception(vcpu)) + return 1; + + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.hsr = lower_32_bits(esr); + run->debug.arch.hsr_high = upper_32_bits(esr); + run->flags = KVM_DEBUG_ARCH_HSR_HIGH_VALID; + + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_WATCHPT_LOW: + run->debug.arch.far = vcpu->arch.fault.far_el2; + break; + case ESR_ELx_EC_SOFTSTP_LOW: + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + break; + } + + return 0; +} + +static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + + kvm_pr_unimpl("Unknown exception class: esr: %#016llx -- %s\n", + esr, esr_get_class_string(esr)); + + kvm_inject_undefined(vcpu); + return 1; +} + +/* + * Guest access to SVE registers should be routed to this handler only + * when the system doesn't support SVE. + */ +static int handle_sve(struct kvm_vcpu *vcpu) +{ + if (guest_hyp_sve_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + kvm_inject_undefined(vcpu); + return 1; +} + +/* + * Two possibilities to handle a trapping ptrauth instruction: + * + * - Guest usage of a ptrauth instruction (which the guest EL1 did not + * turn into a NOP). If we get here, it is because we didn't enable + * ptrauth for the guest. This results in an UNDEF, as it isn't + * supposed to use ptrauth without being told it could. + * + * - Running an L2 NV guest while L1 has left HCR_EL2.API==0, and for + * which we reinject the exception into L1. + * + * Anything else is an emulation bug (hence the WARN_ON + UNDEF). + */ +static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu) +{ + if (!vcpu_has_ptrauth(vcpu)) { + kvm_inject_undefined(vcpu); + return 1; + } + + if (is_nested_ctxt(vcpu)) { + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return 1; + } + + /* Really shouldn't be here! 
*/ + WARN_ON_ONCE(1); + kvm_inject_undefined(vcpu); + return 1; +} + +static int kvm_handle_eret(struct kvm_vcpu *vcpu) { - kvm_vcpu_block(vcpu); + if (esr_iss_is_eretax(kvm_vcpu_get_esr(vcpu)) && + !vcpu_has_ptrauth(vcpu)) + return kvm_handle_ptrauth(vcpu); + + /* + * If we got here, two possibilities: + * + * - the guest is in EL2, and we need to fully emulate ERET + * + * - the guest is in EL1, and we need to reinject the + * exception into the L1 hypervisor. + * + * If KVM ever traps ERET for its own use, we'll have to + * revisit this. + */ + if (is_hyp_ctxt(vcpu)) + kvm_emulate_nested_eret(vcpu); + else + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + return 1; +} + +static int handle_svc(struct kvm_vcpu *vcpu) +{ + /* + * So far, SVC traps only for NV via HFGITR_EL2. A SVC from a + * 32bit guest would be caught by vcpu_mode_is_bad_32bit(), so + * we should only have to deal with a 64 bit exception. + */ + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return 1; +} + +static int kvm_handle_gcs(struct kvm_vcpu *vcpu) +{ + /* We don't expect GCS, so treat it with contempt */ + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, GCS, IMP)) + WARN_ON_ONCE(1); + + kvm_inject_undefined(vcpu); + return 1; +} + +static int handle_other(struct kvm_vcpu *vcpu) +{ + bool allowed, fwd = is_nested_ctxt(vcpu); + u64 hcrx = __vcpu_sys_reg(vcpu, HCRX_EL2); + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 iss = ESR_ELx_ISS(esr); + struct kvm *kvm = vcpu->kvm; + + /* + * We only trap for two reasons: + * + * - the feature is disabled, and the only outcome is to + * generate an UNDEF. + * + * - the feature is enabled, but a NV guest wants to trap the + * feature used by its L2 guest. We forward the exception in + * this case. + * + * What we don't expect is to end up here if the guest is + * expected to be able to directly use the feature, hence the + * WARN_ON below. + */ + switch (iss) { + case ESR_ELx_ISS_OTHER_ST64BV: + allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V); + fwd &= !(hcrx & HCRX_EL2_EnASR); + break; + case ESR_ELx_ISS_OTHER_ST64BV0: + allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA); + fwd &= !(hcrx & HCRX_EL2_EnAS0); + break; + case ESR_ELx_ISS_OTHER_LDST64B: + allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64); + fwd &= !(hcrx & HCRX_EL2_EnALS); + break; + case ESR_ELx_ISS_OTHER_TSBCSYNC: + allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1); + fwd &= (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC); + break; + case ESR_ELx_ISS_OTHER_PSBCSYNC: + allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P5); + fwd &= (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC); + break; + default: + /* Clearly, we're missing something. */ + WARN_ON_ONCE(1); + allowed = false; + } + + WARN_ON_ONCE(allowed && !fwd); + + if (allowed && fwd) + kvm_inject_nested_sync(vcpu, esr); + else + kvm_inject_undefined(vcpu); + return 1; } static exit_handle_fn arm_exit_handlers[] = { - [ESR_EL2_EC_WFI] = kvm_handle_wfi, - [ESR_EL2_EC_CP15_32] = kvm_handle_cp15_32, - [ESR_EL2_EC_CP15_64] = kvm_handle_cp15_64, - [ESR_EL2_EC_CP14_MR] = kvm_handle_cp14_access, - [ESR_EL2_EC_CP14_LS] = kvm_handle_cp14_load_store, - [ESR_EL2_EC_CP14_64] = kvm_handle_cp14_access, - [ESR_EL2_EC_HVC32] = handle_hvc, - [ESR_EL2_EC_SMC32] = handle_smc, - [ESR_EL2_EC_HVC64] = handle_hvc, - [ESR_EL2_EC_SMC64] = handle_smc, - [ESR_EL2_EC_SYS64] = kvm_handle_sys_reg, - [ESR_EL2_EC_IABT] = kvm_handle_guest_abort, - [ESR_EL2_EC_DABT] = kvm_handle_guest_abort, + [0 ... 
ESR_ELx_EC_MAX] = kvm_handle_unknown_ec, + [ESR_ELx_EC_WFx] = kvm_handle_wfx, + [ESR_ELx_EC_CP15_32] = kvm_handle_cp15_32, + [ESR_ELx_EC_CP15_64] = kvm_handle_cp15_64, + [ESR_ELx_EC_CP14_MR] = kvm_handle_cp14_32, + [ESR_ELx_EC_CP14_LS] = kvm_handle_cp14_load_store, + [ESR_ELx_EC_CP10_ID] = kvm_handle_cp10_id, + [ESR_ELx_EC_CP14_64] = kvm_handle_cp14_64, + [ESR_ELx_EC_OTHER] = handle_other, + [ESR_ELx_EC_HVC32] = handle_hvc, + [ESR_ELx_EC_SMC32] = handle_smc, + [ESR_ELx_EC_HVC64] = handle_hvc, + [ESR_ELx_EC_SMC64] = handle_smc, + [ESR_ELx_EC_SVC64] = handle_svc, + [ESR_ELx_EC_SYS64] = kvm_handle_sys_reg, + [ESR_ELx_EC_SVE] = handle_sve, + [ESR_ELx_EC_ERET] = kvm_handle_eret, + [ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort, + [ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort, + [ESR_ELx_EC_DABT_CUR] = kvm_handle_vncr_abort, + [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug, + [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug, + [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, + [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, + [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, + [ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd, + [ESR_ELx_EC_PAC] = kvm_handle_ptrauth, + [ESR_ELx_EC_GCS] = kvm_handle_gcs, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) { - u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); + u8 esr_ec = ESR_ELx_EC(esr); - if (hsr_ec >= ARRAY_SIZE(arm_exit_handlers) || - !arm_exit_handlers[hsr_ec]) { - kvm_err("Unkown exception class: hsr: %#08x\n", - (unsigned int)kvm_vcpu_get_hsr(vcpu)); - BUG(); + return arm_exit_handlers[esr_ec]; +} + +/* + * We may be single-stepping an emulated instruction. If the emulation + * has been completed in the kernel, we can return to userspace with a + * KVM_EXIT_DEBUG, otherwise userspace needs to complete its + * emulation first. + */ +static int handle_trap_exceptions(struct kvm_vcpu *vcpu) +{ + int handled; + + /* + * See ARM ARM B1.14.1: "Hyp traps on instructions + * that fail their condition code check" + */ + if (!kvm_condition_valid(vcpu)) { + kvm_incr_pc(vcpu); + handled = 1; + } else { + exit_handle_fn exit_handler; + + exit_handler = kvm_get_exit_handler(vcpu); + handled = exit_handler(vcpu); } - return arm_exit_handlers[hsr_ec]; + return handled; } /* * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on * proper exit to userspace. */ -int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index) +int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { - exit_handle_fn exit_handler; + struct kvm_run *run = vcpu->run; + + if (ARM_SERROR_PENDING(exception_index)) { + /* + * The SError is handled by handle_exit_early(). If the guest + * survives it will re-execute the original instruction. + */ + return 1; + } + + exception_index = ARM_EXCEPTION_CODE(exception_index); switch (exception_index) { case ARM_EXCEPTION_IRQ: return 1; + case ARM_EXCEPTION_EL1_SERROR: + return 1; case ARM_EXCEPTION_TRAP: + return handle_trap_exceptions(vcpu); + case ARM_EXCEPTION_HYP_GONE: /* - * See ARM ARM B1.14.1: "Hyp traps on instructions - * that fail their condition code check" + * EL2 has been reset to the hyp-stub. This happens when a guest + * is pre-empted by kvm_reboot()'s shutdown call. 
*/ - if (!kvm_condition_valid(vcpu)) { - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; - } - - exit_handler = kvm_get_exit_handler(vcpu); - - return exit_handler(vcpu, run); + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return 0; + case ARM_EXCEPTION_IL: + /* + * We attempted an illegal exception return. Guest state must + * have been corrupted somehow. Give up. + */ + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return -EINVAL; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); @@ -122,3 +485,96 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, return 0; } } + +/* For exit types that need handling before we can be preempted */ +void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) +{ + if (ARM_SERROR_PENDING(exception_index)) { + if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) { + u64 disr = kvm_vcpu_get_disr(vcpu); + + kvm_handle_guest_serror(vcpu, disr_to_esr(disr)); + } else { + kvm_inject_serror(vcpu); + } + + return; + } + + exception_index = ARM_EXCEPTION_CODE(exception_index); + + if (exception_index == ARM_EXCEPTION_EL1_SERROR) + kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); +} + +static void print_nvhe_hyp_panic(const char *name, u64 panic_addr) +{ + kvm_err("nVHE hyp %s at: [<%016llx>] %pB!\n", name, panic_addr, + (void *)(panic_addr + kaslr_offset())); +} + +static void kvm_nvhe_report_cfi_failure(u64 panic_addr) +{ + print_nvhe_hyp_panic("CFI failure", panic_addr); + + if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) + kvm_err(" (CONFIG_CFI_PERMISSIVE ignored for hyp failures)\n"); +} + +void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, + u64 elr_virt, u64 elr_phys, + u64 par, uintptr_t vcpu, + u64 far, u64 hpfar) { + u64 elr_in_kimg = __phys_to_kimg(elr_phys); + u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt; + u64 mode = spsr & PSR_MODE_MASK; + u64 panic_addr = elr_virt + hyp_offset; + + if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) { + kvm_err("Invalid host exception to nVHE hyp!\n"); + } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && + esr_brk_comment(esr) == BUG_BRK_IMM) { + const char *file = NULL; + unsigned int line = 0; + + /* All hyp bugs, including warnings, are treated as fatal. */ + if (!is_protected_kvm_enabled() || + IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + struct bug_entry *bug = find_bug(elr_in_kimg); + + if (bug) + bug_get_file_line(bug, &file, &line); + } + + if (file) + kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); + else + print_nvhe_hyp_panic("BUG", panic_addr); + } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) { + kvm_nvhe_report_cfi_failure(panic_addr); + } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) && + ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && + esr_is_ubsan_brk(esr)) { + print_nvhe_hyp_panic(report_ubsan_failure(esr & UBSAN_BRK_MASK), + panic_addr); + } else { + print_nvhe_hyp_panic("panic", panic_addr); + } + + /* Dump the nVHE hypervisor backtrace */ + kvm_nvhe_dump_backtrace(hyp_offset); + + /* Dump the faulting instruction */ + dump_kernel_instr(panic_addr + kaslr_offset()); + + /* + * Hyp has panicked and we're going to handle that by panicking the + * kernel. The kernel offset will be revealed in the panic so we're + * also safe to reveal the hyp offset as a debugging aid for translating + * hyp VAs to vmlinux addresses. 
+ */ + kvm_err("Hyp Offset: 0x%llx\n", hyp_offset); + + panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%016llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", + spsr, elr_virt, esr, far, hpfar, par, vcpu); +} diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S deleted file mode 100644 index ba84e6705e20..000000000000 --- a/arch/arm64/kvm/hyp-init.S +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/linkage.h> - -#include <asm/assembler.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_mmu.h> - - .text - .pushsection .hyp.idmap.text, "ax" - - .align 11 - -ENTRY(__kvm_hyp_init) - ventry __invalid // Synchronous EL2t - ventry __invalid // IRQ EL2t - ventry __invalid // FIQ EL2t - ventry __invalid // Error EL2t - - ventry __invalid // Synchronous EL2h - ventry __invalid // IRQ EL2h - ventry __invalid // FIQ EL2h - ventry __invalid // Error EL2h - - ventry __do_hyp_init // Synchronous 64-bit EL1 - ventry __invalid // IRQ 64-bit EL1 - ventry __invalid // FIQ 64-bit EL1 - ventry __invalid // Error 64-bit EL1 - - ventry __invalid // Synchronous 32-bit EL1 - ventry __invalid // IRQ 32-bit EL1 - ventry __invalid // FIQ 32-bit EL1 - ventry __invalid // Error 32-bit EL1 - -__invalid: - b . - - /* - * x0: HYP boot pgd - * x1: HYP pgd - * x2: HYP stack - * x3: HYP vectors - */ -__do_hyp_init: - - msr ttbr0_el2, x0 - - mrs x4, tcr_el1 - ldr x5, =TCR_EL2_MASK - and x4, x4, x5 - ldr x5, =TCR_EL2_FLAGS - orr x4, x4, x5 - msr tcr_el2, x4 - - ldr x4, =VTCR_EL2_FLAGS - msr vtcr_el2, x4 - - mrs x4, mair_el1 - msr mair_el2, x4 - isb - - mov x4, #SCTLR_EL2_FLAGS - msr sctlr_el2, x4 - isb - - /* MMU is now enabled. Get ready for the trampoline dance */ - ldr x4, =TRAMPOLINE_VA - adr x5, target - bfi x4, x5, #0, #PAGE_SHIFT - br x4 - -target: /* We're now in the trampoline code, switch page tables */ - msr ttbr0_el2, x1 - isb - - /* Invalidate the old TLBs */ - tlbi alle2 - dsb sy - - /* Set the stack and new vectors */ - kern_hyp_va x2 - mov sp, x2 - kern_hyp_va x3 - msr vbar_el2, x3 - - /* Hello, World! */ - eret -ENDPROC(__kvm_hyp_init) - - .ltorg - - .popsection diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S deleted file mode 100644 index ff985e3d8b72..000000000000 --- a/arch/arm64/kvm/hyp.S +++ /dev/null @@ -1,831 +0,0 @@ -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/linkage.h> -#include <linux/irqchip/arm-gic.h> - -#include <asm/assembler.h> -#include <asm/memory.h> -#include <asm/asm-offsets.h> -#include <asm/fpsimdmacros.h> -#include <asm/kvm.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_mmu.h> - -#define CPU_GP_REG_OFFSET(x) (CPU_GP_REGS + x) -#define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x) -#define CPU_SPSR_OFFSET(x) CPU_GP_REG_OFFSET(CPU_SPSR + 8*x) -#define CPU_SYSREG_OFFSET(x) (CPU_SYSREGS + 8*x) - - .text - .pushsection .hyp.text, "ax" - .align PAGE_SHIFT - -__kvm_hyp_code_start: - .globl __kvm_hyp_code_start - -.macro save_common_regs - // x2: base address for cpu context - // x3: tmp register - - add x3, x2, #CPU_XREG_OFFSET(19) - stp x19, x20, [x3] - stp x21, x22, [x3, #16] - stp x23, x24, [x3, #32] - stp x25, x26, [x3, #48] - stp x27, x28, [x3, #64] - stp x29, lr, [x3, #80] - - mrs x19, sp_el0 - mrs x20, elr_el2 // EL1 PC - mrs x21, spsr_el2 // EL1 pstate - - stp x19, x20, [x3, #96] - str x21, [x3, #112] - - mrs x22, sp_el1 - mrs x23, elr_el1 - mrs x24, spsr_el1 - - str x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)] - str x23, [x2, #CPU_GP_REG_OFFSET(CPU_ELR_EL1)] - str x24, [x2, #CPU_SPSR_OFFSET(KVM_SPSR_EL1)] -.endm - -.macro restore_common_regs - // x2: base address for cpu context - // x3: tmp register - - ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)] - ldr x23, [x2, #CPU_GP_REG_OFFSET(CPU_ELR_EL1)] - ldr x24, [x2, #CPU_SPSR_OFFSET(KVM_SPSR_EL1)] - - msr sp_el1, x22 - msr elr_el1, x23 - msr spsr_el1, x24 - - add x3, x2, #CPU_XREG_OFFSET(31) // SP_EL0 - ldp x19, x20, [x3] - ldr x21, [x3, #16] - - msr sp_el0, x19 - msr elr_el2, x20 // EL1 PC - msr spsr_el2, x21 // EL1 pstate - - add x3, x2, #CPU_XREG_OFFSET(19) - ldp x19, x20, [x3] - ldp x21, x22, [x3, #16] - ldp x23, x24, [x3, #32] - ldp x25, x26, [x3, #48] - ldp x27, x28, [x3, #64] - ldp x29, lr, [x3, #80] -.endm - -.macro save_host_regs - save_common_regs -.endm - -.macro restore_host_regs - restore_common_regs -.endm - -.macro save_fpsimd - // x2: cpu context address - // x3, x4: tmp regs - add x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS) - fpsimd_save x3, 4 -.endm - -.macro restore_fpsimd - // x2: cpu context address - // x3, x4: tmp regs - add x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS) - fpsimd_restore x3, 4 -.endm - -.macro save_guest_regs - // x0 is the vcpu address - // x1 is the return code, do not corrupt! - // x2 is the cpu context - // x3 is a tmp register - // Guest's x0-x3 are on the stack - - // Compute base to save registers - add x3, x2, #CPU_XREG_OFFSET(4) - stp x4, x5, [x3] - stp x6, x7, [x3, #16] - stp x8, x9, [x3, #32] - stp x10, x11, [x3, #48] - stp x12, x13, [x3, #64] - stp x14, x15, [x3, #80] - stp x16, x17, [x3, #96] - str x18, [x3, #112] - - pop x6, x7 // x2, x3 - pop x4, x5 // x0, x1 - - add x3, x2, #CPU_XREG_OFFSET(0) - stp x4, x5, [x3] - stp x6, x7, [x3, #16] - - save_common_regs -.endm - -.macro restore_guest_regs - // x0 is the vcpu address. 
- // x2 is the cpu context - // x3 is a tmp register - - // Prepare x0-x3 for later restore - add x3, x2, #CPU_XREG_OFFSET(0) - ldp x4, x5, [x3] - ldp x6, x7, [x3, #16] - push x4, x5 // Push x0-x3 on the stack - push x6, x7 - - // x4-x18 - ldp x4, x5, [x3, #32] - ldp x6, x7, [x3, #48] - ldp x8, x9, [x3, #64] - ldp x10, x11, [x3, #80] - ldp x12, x13, [x3, #96] - ldp x14, x15, [x3, #112] - ldp x16, x17, [x3, #128] - ldr x18, [x3, #144] - - // x19-x29, lr, sp*, elr*, spsr* - restore_common_regs - - // Last bits of the 64bit state - pop x2, x3 - pop x0, x1 - - // Do not touch any register after this! -.endm - -/* - * Macros to perform system register save/restore. - * - * Ordering here is absolutely critical, and must be kept consistent - * in {save,restore}_sysregs, {save,restore}_guest_32bit_state, - * and in kvm_asm.h. - * - * In other words, don't touch any of these unless you know what - * you are doing. - */ -.macro save_sysregs - // x2: base address for cpu context - // x3: tmp register - - add x3, x2, #CPU_SYSREG_OFFSET(MPIDR_EL1) - - mrs x4, vmpidr_el2 - mrs x5, csselr_el1 - mrs x6, sctlr_el1 - mrs x7, actlr_el1 - mrs x8, cpacr_el1 - mrs x9, ttbr0_el1 - mrs x10, ttbr1_el1 - mrs x11, tcr_el1 - mrs x12, esr_el1 - mrs x13, afsr0_el1 - mrs x14, afsr1_el1 - mrs x15, far_el1 - mrs x16, mair_el1 - mrs x17, vbar_el1 - mrs x18, contextidr_el1 - mrs x19, tpidr_el0 - mrs x20, tpidrro_el0 - mrs x21, tpidr_el1 - mrs x22, amair_el1 - mrs x23, cntkctl_el1 - - stp x4, x5, [x3] - stp x6, x7, [x3, #16] - stp x8, x9, [x3, #32] - stp x10, x11, [x3, #48] - stp x12, x13, [x3, #64] - stp x14, x15, [x3, #80] - stp x16, x17, [x3, #96] - stp x18, x19, [x3, #112] - stp x20, x21, [x3, #128] - stp x22, x23, [x3, #144] -.endm - -.macro restore_sysregs - // x2: base address for cpu context - // x3: tmp register - - add x3, x2, #CPU_SYSREG_OFFSET(MPIDR_EL1) - - ldp x4, x5, [x3] - ldp x6, x7, [x3, #16] - ldp x8, x9, [x3, #32] - ldp x10, x11, [x3, #48] - ldp x12, x13, [x3, #64] - ldp x14, x15, [x3, #80] - ldp x16, x17, [x3, #96] - ldp x18, x19, [x3, #112] - ldp x20, x21, [x3, #128] - ldp x22, x23, [x3, #144] - - msr vmpidr_el2, x4 - msr csselr_el1, x5 - msr sctlr_el1, x6 - msr actlr_el1, x7 - msr cpacr_el1, x8 - msr ttbr0_el1, x9 - msr ttbr1_el1, x10 - msr tcr_el1, x11 - msr esr_el1, x12 - msr afsr0_el1, x13 - msr afsr1_el1, x14 - msr far_el1, x15 - msr mair_el1, x16 - msr vbar_el1, x17 - msr contextidr_el1, x18 - msr tpidr_el0, x19 - msr tpidrro_el0, x20 - msr tpidr_el1, x21 - msr amair_el1, x22 - msr cntkctl_el1, x23 -.endm - -.macro skip_32bit_state tmp, target - // Skip 32bit state if not needed - mrs \tmp, hcr_el2 - tbnz \tmp, #HCR_RW_SHIFT, \target -.endm - -.macro skip_tee_state tmp, target - // Skip ThumbEE state if not needed - mrs \tmp, id_pfr0_el1 - tbz \tmp, #12, \target -.endm - -.macro save_guest_32bit_state - skip_32bit_state x3, 1f - - add x3, x2, #CPU_SPSR_OFFSET(KVM_SPSR_ABT) - mrs x4, spsr_abt - mrs x5, spsr_und - mrs x6, spsr_irq - mrs x7, spsr_fiq - stp x4, x5, [x3] - stp x6, x7, [x3, #16] - - add x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2) - mrs x4, dacr32_el2 - mrs x5, ifsr32_el2 - mrs x6, fpexc32_el2 - mrs x7, dbgvcr32_el2 - stp x4, x5, [x3] - stp x6, x7, [x3, #16] - - skip_tee_state x8, 1f - - add x3, x2, #CPU_SYSREG_OFFSET(TEECR32_EL1) - mrs x4, teecr32_el1 - mrs x5, teehbr32_el1 - stp x4, x5, [x3] -1: -.endm - -.macro restore_guest_32bit_state - skip_32bit_state x3, 1f - - add x3, x2, #CPU_SPSR_OFFSET(KVM_SPSR_ABT) - ldp x4, x5, [x3] - ldp x6, x7, [x3, #16] - msr spsr_abt, x4 - msr spsr_und, x5 - 
msr spsr_irq, x6 - msr spsr_fiq, x7 - - add x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2) - ldp x4, x5, [x3] - ldp x6, x7, [x3, #16] - msr dacr32_el2, x4 - msr ifsr32_el2, x5 - msr fpexc32_el2, x6 - msr dbgvcr32_el2, x7 - - skip_tee_state x8, 1f - - add x3, x2, #CPU_SYSREG_OFFSET(TEECR32_EL1) - ldp x4, x5, [x3] - msr teecr32_el1, x4 - msr teehbr32_el1, x5 -1: -.endm - -.macro activate_traps - ldr x2, [x0, #VCPU_IRQ_LINES] - ldr x1, [x0, #VCPU_HCR_EL2] - orr x2, x2, x1 - msr hcr_el2, x2 - - ldr x2, =(CPTR_EL2_TTA) - msr cptr_el2, x2 - - ldr x2, =(1 << 15) // Trap CP15 Cr=15 - msr hstr_el2, x2 - - mrs x2, mdcr_el2 - and x2, x2, #MDCR_EL2_HPMN_MASK - orr x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR) - msr mdcr_el2, x2 -.endm - -.macro deactivate_traps - mov x2, #HCR_RW - msr hcr_el2, x2 - msr cptr_el2, xzr - msr hstr_el2, xzr - - mrs x2, mdcr_el2 - and x2, x2, #MDCR_EL2_HPMN_MASK - msr mdcr_el2, x2 -.endm - -.macro activate_vm - ldr x1, [x0, #VCPU_KVM] - kern_hyp_va x1 - ldr x2, [x1, #KVM_VTTBR] - msr vttbr_el2, x2 -.endm - -.macro deactivate_vm - msr vttbr_el2, xzr -.endm - -/* - * Save the VGIC CPU state into memory - * x0: Register pointing to VCPU struct - * Do not corrupt x1!!! - */ -.macro save_vgic_state - /* Get VGIC VCTRL base into x2 */ - ldr x2, [x0, #VCPU_KVM] - kern_hyp_va x2 - ldr x2, [x2, #KVM_VGIC_VCTRL] - kern_hyp_va x2 - cbz x2, 2f // disabled - - /* Compute the address of struct vgic_cpu */ - add x3, x0, #VCPU_VGIC_CPU - - /* Save all interesting registers */ - ldr w4, [x2, #GICH_HCR] - ldr w5, [x2, #GICH_VMCR] - ldr w6, [x2, #GICH_MISR] - ldr w7, [x2, #GICH_EISR0] - ldr w8, [x2, #GICH_EISR1] - ldr w9, [x2, #GICH_ELRSR0] - ldr w10, [x2, #GICH_ELRSR1] - ldr w11, [x2, #GICH_APR] - - str w4, [x3, #VGIC_CPU_HCR] - str w5, [x3, #VGIC_CPU_VMCR] - str w6, [x3, #VGIC_CPU_MISR] - str w7, [x3, #VGIC_CPU_EISR] - str w8, [x3, #(VGIC_CPU_EISR + 4)] - str w9, [x3, #VGIC_CPU_ELRSR] - str w10, [x3, #(VGIC_CPU_ELRSR + 4)] - str w11, [x3, #VGIC_CPU_APR] - - /* Clear GICH_HCR */ - str wzr, [x2, #GICH_HCR] - - /* Save list registers */ - add x2, x2, #GICH_LR0 - ldr w4, [x3, #VGIC_CPU_NR_LR] - add x3, x3, #VGIC_CPU_LR -1: ldr w5, [x2], #4 - str w5, [x3], #4 - sub w4, w4, #1 - cbnz w4, 1b -2: -.endm - -/* - * Restore the VGIC CPU state from memory - * x0: Register pointing to VCPU struct - */ -.macro restore_vgic_state - /* Get VGIC VCTRL base into x2 */ - ldr x2, [x0, #VCPU_KVM] - kern_hyp_va x2 - ldr x2, [x2, #KVM_VGIC_VCTRL] - kern_hyp_va x2 - cbz x2, 2f // disabled - - /* Compute the address of struct vgic_cpu */ - add x3, x0, #VCPU_VGIC_CPU - - /* We only restore a minimal set of registers */ - ldr w4, [x3, #VGIC_CPU_HCR] - ldr w5, [x3, #VGIC_CPU_VMCR] - ldr w6, [x3, #VGIC_CPU_APR] - - str w4, [x2, #GICH_HCR] - str w5, [x2, #GICH_VMCR] - str w6, [x2, #GICH_APR] - - /* Restore list registers */ - add x2, x2, #GICH_LR0 - ldr w4, [x3, #VGIC_CPU_NR_LR] - add x3, x3, #VGIC_CPU_LR -1: ldr w5, [x3], #4 - str w5, [x2], #4 - sub w4, w4, #1 - cbnz w4, 1b -2: -.endm - -.macro save_timer_state - // x0: vcpu pointer - ldr x2, [x0, #VCPU_KVM] - kern_hyp_va x2 - ldr w3, [x2, #KVM_TIMER_ENABLED] - cbz w3, 1f - - mrs x3, cntv_ctl_el0 - and x3, x3, #3 - str w3, [x0, #VCPU_TIMER_CNTV_CTL] - bic x3, x3, #1 // Clear Enable - msr cntv_ctl_el0, x3 - - isb - - mrs x3, cntv_cval_el0 - str x3, [x0, #VCPU_TIMER_CNTV_CVAL] - -1: - // Allow physical timer/counter access for the host - mrs x2, cnthctl_el2 - orr x2, x2, #3 - msr cnthctl_el2, x2 - - // Clear cntvoff for the host - msr cntvoff_el2, xzr -.endm - -.macro 
restore_timer_state - // x0: vcpu pointer - // Disallow physical timer access for the guest - // Physical counter access is allowed - mrs x2, cnthctl_el2 - orr x2, x2, #1 - bic x2, x2, #2 - msr cnthctl_el2, x2 - - ldr x2, [x0, #VCPU_KVM] - kern_hyp_va x2 - ldr w3, [x2, #KVM_TIMER_ENABLED] - cbz w3, 1f - - ldr x3, [x2, #KVM_TIMER_CNTVOFF] - msr cntvoff_el2, x3 - ldr x2, [x0, #VCPU_TIMER_CNTV_CVAL] - msr cntv_cval_el0, x2 - isb - - ldr w2, [x0, #VCPU_TIMER_CNTV_CTL] - and x2, x2, #3 - msr cntv_ctl_el0, x2 -1: -.endm - -__save_sysregs: - save_sysregs - ret - -__restore_sysregs: - restore_sysregs - ret - -__save_fpsimd: - save_fpsimd - ret - -__restore_fpsimd: - restore_fpsimd - ret - -/* - * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu); - * - * This is the world switch. The first half of the function - * deals with entering the guest, and anything from __kvm_vcpu_return - * to the end of the function deals with reentering the host. - * On the enter path, only x0 (vcpu pointer) must be preserved until - * the last moment. On the exit path, x0 (vcpu pointer) and x1 (exception - * code) must both be preserved until the epilogue. - * In both cases, x2 points to the CPU context we're saving/restoring from/to. - */ -ENTRY(__kvm_vcpu_run) - kern_hyp_va x0 - msr tpidr_el2, x0 // Save the vcpu register - - // Host context - ldr x2, [x0, #VCPU_HOST_CONTEXT] - kern_hyp_va x2 - - save_host_regs - bl __save_fpsimd - bl __save_sysregs - - activate_traps - activate_vm - - restore_vgic_state - restore_timer_state - - // Guest context - add x2, x0, #VCPU_CONTEXT - - bl __restore_sysregs - bl __restore_fpsimd - restore_guest_32bit_state - restore_guest_regs - - // That's it, no more messing around. - eret - -__kvm_vcpu_return: - // Assume x0 is the vcpu pointer, x1 the return code - // Guest's x0-x3 are on the stack - - // Guest context - add x2, x0, #VCPU_CONTEXT - - save_guest_regs - bl __save_fpsimd - bl __save_sysregs - save_guest_32bit_state - - save_timer_state - save_vgic_state - - deactivate_traps - deactivate_vm - - // Host context - ldr x2, [x0, #VCPU_HOST_CONTEXT] - kern_hyp_va x2 - - bl __restore_sysregs - bl __restore_fpsimd - restore_host_regs - - mov x0, x1 - ret -END(__kvm_vcpu_run) - -// void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); -ENTRY(__kvm_tlb_flush_vmid_ipa) - kern_hyp_va x0 - ldr x2, [x0, #KVM_VTTBR] - msr vttbr_el2, x2 - isb - - /* - * We could do so much better if we had the VA as well. - * Instead, we invalidate Stage-2 for this IPA, and the - * whole of Stage-1. Weep... - */ - tlbi ipas2e1is, x1 - dsb sy - tlbi vmalle1is - dsb sy - isb - - msr vttbr_el2, xzr - ret -ENDPROC(__kvm_tlb_flush_vmid_ipa) - -ENTRY(__kvm_flush_vm_context) - tlbi alle1is - ic ialluis - dsb sy - ret -ENDPROC(__kvm_flush_vm_context) - -__kvm_hyp_panic: - // Guess the context by looking at VTTBR: - // If zero, then we're already a host. - // Otherwise restore a minimal host context before panicing. 
- mrs x0, vttbr_el2 - cbz x0, 1f - - mrs x0, tpidr_el2 - - deactivate_traps - deactivate_vm - - ldr x2, [x0, #VCPU_HOST_CONTEXT] - kern_hyp_va x2 - - bl __restore_sysregs - -1: adr x0, __hyp_panic_str - adr x1, 2f - ldp x2, x3, [x1] - sub x0, x0, x2 - add x0, x0, x3 - mrs x1, spsr_el2 - mrs x2, elr_el2 - mrs x3, esr_el2 - mrs x4, far_el2 - mrs x5, hpfar_el2 - mrs x6, par_el1 - mrs x7, tpidr_el2 - - mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ - PSR_MODE_EL1h) - msr spsr_el2, lr - ldr lr, =panic - msr elr_el2, lr - eret - - .align 3 -2: .quad HYP_PAGE_OFFSET - .quad PAGE_OFFSET -ENDPROC(__kvm_hyp_panic) - -__hyp_panic_str: - .ascii "HYP panic:\nPS:%08x PC:%p ESR:%p\nFAR:%p HPFAR:%p PAR:%p\nVCPU:%p\n\0" - - .align 2 - -ENTRY(kvm_call_hyp) - hvc #0 - ret -ENDPROC(kvm_call_hyp) - -.macro invalid_vector label, target - .align 2 -\label: - b \target -ENDPROC(\label) -.endm - - /* None of these should ever happen */ - invalid_vector el2t_sync_invalid, __kvm_hyp_panic - invalid_vector el2t_irq_invalid, __kvm_hyp_panic - invalid_vector el2t_fiq_invalid, __kvm_hyp_panic - invalid_vector el2t_error_invalid, __kvm_hyp_panic - invalid_vector el2h_sync_invalid, __kvm_hyp_panic - invalid_vector el2h_irq_invalid, __kvm_hyp_panic - invalid_vector el2h_fiq_invalid, __kvm_hyp_panic - invalid_vector el2h_error_invalid, __kvm_hyp_panic - invalid_vector el1_sync_invalid, __kvm_hyp_panic - invalid_vector el1_irq_invalid, __kvm_hyp_panic - invalid_vector el1_fiq_invalid, __kvm_hyp_panic - invalid_vector el1_error_invalid, __kvm_hyp_panic - -el1_sync: // Guest trapped into EL2 - push x0, x1 - push x2, x3 - - mrs x1, esr_el2 - lsr x2, x1, #ESR_EL2_EC_SHIFT - - cmp x2, #ESR_EL2_EC_HVC64 - b.ne el1_trap - - mrs x3, vttbr_el2 // If vttbr is valid, the 64bit guest - cbnz x3, el1_trap // called HVC - - /* Here, we're pretty sure the host called HVC. */ - pop x2, x3 - pop x0, x1 - - push lr, xzr - - /* - * Compute the function address in EL2, and shuffle the parameters. - */ - kern_hyp_va x0 - mov lr, x0 - mov x0, x1 - mov x1, x2 - mov x2, x3 - blr lr - - pop lr, xzr - eret - -el1_trap: - /* - * x1: ESR - * x2: ESR_EC - */ - cmp x2, #ESR_EL2_EC_DABT - mov x0, #ESR_EL2_EC_IABT - ccmp x2, x0, #4, ne - b.ne 1f // Not an abort we care about - - /* This is an abort. Check for permission fault */ - and x2, x1, #ESR_EL2_FSC_TYPE - cmp x2, #FSC_PERM - b.ne 1f // Not a permission fault - - /* - * Check for Stage-1 page table walk, which is guaranteed - * to give a valid HPFAR_EL2. - */ - tbnz x1, #7, 1f // S1PTW is set - - /* - * Permission fault, HPFAR_EL2 is invalid. - * Resolve the IPA the hard way using the guest VA. - * Stage-1 translation already validated the memory access rights. - * As such, we can use the EL1 translation regime, and don't have - * to distinguish between EL0 and EL1 access. - */ - mrs x2, far_el2 - at s1e1r, x2 - isb - - /* Read result */ - mrs x3, par_el1 - tbnz x3, #0, 3f // Bail out if we failed the translation - ubfx x3, x3, #12, #36 // Extract IPA - lsl x3, x3, #4 // and present it like HPFAR - b 2f - -1: mrs x3, hpfar_el2 - mrs x2, far_el2 - -2: mrs x0, tpidr_el2 - str x1, [x0, #VCPU_ESR_EL2] - str x2, [x0, #VCPU_FAR_EL2] - str x3, [x0, #VCPU_HPFAR_EL2] - - mov x1, #ARM_EXCEPTION_TRAP - b __kvm_vcpu_return - - /* - * Translation failed. Just return to the guest and - * let it fault again. Another CPU is probably playing - * behind our back. 
- */ -3: pop x2, x3 - pop x0, x1 - - eret - -el1_irq: - push x0, x1 - push x2, x3 - mrs x0, tpidr_el2 - mov x1, #ARM_EXCEPTION_IRQ - b __kvm_vcpu_return - - .ltorg - - .align 11 - -ENTRY(__kvm_hyp_vector) - ventry el2t_sync_invalid // Synchronous EL2t - ventry el2t_irq_invalid // IRQ EL2t - ventry el2t_fiq_invalid // FIQ EL2t - ventry el2t_error_invalid // Error EL2t - - ventry el2h_sync_invalid // Synchronous EL2h - ventry el2h_irq_invalid // IRQ EL2h - ventry el2h_fiq_invalid // FIQ EL2h - ventry el2h_error_invalid // Error EL2h - - ventry el1_sync // Synchronous 64-bit EL1 - ventry el1_irq // IRQ 64-bit EL1 - ventry el1_fiq_invalid // FIQ 64-bit EL1 - ventry el1_error_invalid // Error 64-bit EL1 - - ventry el1_sync // Synchronous 32-bit EL1 - ventry el1_irq // IRQ 32-bit EL1 - ventry el1_fiq_invalid // FIQ 32-bit EL1 - ventry el1_error_invalid // Error 32-bit EL1 -ENDPROC(__kvm_hyp_vector) - -__kvm_hyp_code_end: - .globl __kvm_hyp_code_end - - .popsection diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile new file mode 100644 index 000000000000..d61e44642f98 --- /dev/null +++ b/arch/arm64/kvm/hyp/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Kernel-based Virtual Machine module, HYP part +# + +incdir := $(src)/include +subdir-asflags-y := -I$(incdir) +subdir-ccflags-y := -I$(incdir) + +obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o diff --git a/arch/arm64/kvm/emulate.c b/arch/arm64/kvm/hyp/aarch32.c index 124418d17049..449fa58cf3b6 100644 --- a/arch/arm64/kvm/emulate.c +++ b/arch/arm64/kvm/hyp/aarch32.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * (not much of an) Emulation layer for 32bit guests. + * Hyp portion of the (not much of an) Emulation layer for 32bit guests. * * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> @@ -7,22 +8,11 @@ * based on arch/arm/kvm/emulate.c * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> /* * stolen from arch/arm/kernel/opcodes.c @@ -51,16 +41,6 @@ static const unsigned short cc_map[16] = { 0 /* NV */ }; -static int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) -{ - u32 esr = kvm_vcpu_get_hsr(vcpu); - - if (esr & ESR_EL2_CV) - return (esr & ESR_EL2_COND) >> ESR_EL2_COND_SHIFT; - - return -1; -} - /* * Check if a trapped instruction should have been executed or not. */ @@ -70,9 +50,23 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) u32 cpsr_cond; int cond; - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_hsr(vcpu) >> 30) + /* + * These are the exception classes that could fire with a + * conditional instruction. 
+ */ + switch (kvm_vcpu_trap_get_class(vcpu)) { + case ESR_ELx_EC_CP15_32: + case ESR_ELx_EC_CP15_64: + case ESR_ELx_EC_CP14_MR: + case ESR_ELx_EC_CP14_LS: + case ESR_ELx_EC_FP_ASIMD: + case ESR_ELx_EC_CP10_ID: + case ESR_ELx_EC_CP14_64: + case ESR_ELx_EC_SVC32: + break; + default: return true; + } /* Is condition field valid? */ cond = kvm_vcpu_get_condition(vcpu); @@ -104,7 +98,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) } /** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block + * kvm_adjust_itstate - adjust ITSTATE when emulating instructions in IT-block * @vcpu: The VCPU pointer * * When exceptions occur while instructions are executed in Thumb IF-THEN @@ -117,11 +111,9 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) { unsigned long itbits, cond; unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_arm = !(cpsr & COMPAT_PSR_T_BIT); + bool is_arm = !(cpsr & PSR_AA32_T_BIT); - BUG_ON(is_arm && (cpsr & COMPAT_PSR_IT_MASK)); - - if (!(cpsr & COMPAT_PSR_IT_MASK)) + if (is_arm || !(cpsr & PSR_AA32_IT_MASK)) return; cond = (cpsr & 0xe000) >> 13; @@ -134,7 +126,7 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) else itbits = (itbits << 1) & 0x1f; - cpsr &= ~COMPAT_PSR_IT_MASK; + cpsr &= ~PSR_AA32_IT_MASK; cpsr |= cond << 13; cpsr |= (itbits & 0x1c) << (10 - 2); cpsr |= (itbits & 0x3) << 25; @@ -142,17 +134,21 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) } /** - * kvm_skip_instr - skip a trapped instruction and proceed to the next + * kvm_skip_instr32 - skip a trapped instruction and proceed to the next * @vcpu: The vcpu pointer */ -void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) +void kvm_skip_instr32(struct kvm_vcpu *vcpu) { + u32 pc = *vcpu_pc(vcpu); bool is_thumb; - is_thumb = !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_T_BIT); - if (is_thumb && !is_wide_instr) - *vcpu_pc(vcpu) += 2; + is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT); + if (is_thumb && !kvm_vcpu_trap_il_is32bit(vcpu)) + pc += 2; else - *vcpu_pc(vcpu) += 4; + pc += 4; + + *vcpu_pc(vcpu) = pc; + kvm_adjust_itstate(vcpu); } diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S new file mode 100644 index 000000000000..9f4e8d68ab50 --- /dev/null +++ b/arch/arm64/kvm/hyp/entry.S @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/linkage.h> + +#include <asm/alternative.h> +#include <asm/assembler.h> +#include <asm/fpsimdmacros.h> +#include <asm/kvm.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_mte.h> +#include <asm/kvm_ptrauth.h> + + .text + +/* + * u64 __guest_enter(struct kvm_vcpu *vcpu); + */ +SYM_FUNC_START(__guest_enter) + // x0: vcpu + // x1-x17: clobbered by macros + // x29: guest context + + adr_this_cpu x1, kvm_hyp_ctxt, x2 + + // Store the hyp regs + save_callee_saved_regs x1 + + // Save hyp's sp_el0 + save_sp_el0 x1, x2 + + // Now the hyp state is stored if we have a pending RAS SError it must + // affect the host or hyp. If any asynchronous exception is pending we + // defer the guest entry. The DSB isn't necessary before v8.2 as any + // SError would be fatal. +alternative_if ARM64_HAS_RAS_EXTN + dsb nshst + isb +alternative_else_nop_endif + mrs x1, isr_el1 + cbz x1, 1f + + // Ensure that __guest_enter() always provides a context + // synchronization event so that callers don't need ISBs for anything + // that would usually be synchronized by the ERET. 
+ isb + mov x0, #ARM_EXCEPTION_IRQ + ret + +1: + set_loaded_vcpu x0, x1, x2 + + add x29, x0, #VCPU_CONTEXT + + // mte_switch_to_guest(g_ctxt, h_ctxt, tmp1) + mte_switch_to_guest x29, x1, x2 + + // Macro ptrauth_switch_to_guest format: + // ptrauth_switch_to_guest(guest cxt, tmp1, tmp2, tmp3) + // The below macro to restore guest keys is not implemented in C code + // as it may cause Pointer Authentication key signing mismatch errors + // when this feature is enabled for kernel code. + ptrauth_switch_to_guest x29, x0, x1, x2 + + // Restore the guest's sp_el0 + restore_sp_el0 x29, x0 + + // Restore guest regs x0-x17 + ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] + ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] + ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] + ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)] + ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)] + ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)] + ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)] + ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)] + ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)] + + // Restore guest regs x18-x29, lr + restore_callee_saved_regs x29 + + // Do not touch any register after this! + eret + sb + +SYM_INNER_LABEL(__guest_exit_restore_elr_and_panic, SYM_L_GLOBAL) + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack + + adr_this_cpu x0, kvm_hyp_ctxt, x1 + ldr x0, [x0, #CPU_ELR_EL2] + msr elr_el2, x0 + +SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack + + // If the hyp context is loaded, go straight to hyp_panic + get_loaded_vcpu x0, x1 + cbnz x0, 1f + b hyp_panic + +1: + // The hyp context is saved so make sure it is restored to allow + // hyp_panic to run at hyp and, subsequently, panic to run in the host. + // This makes use of __guest_exit to avoid duplication but sets the + // return address to tail call into hyp_panic. As a side effect, the + // current state is saved to the guest context but it will only be + // accurate if the guest had been completely restored. + adr_this_cpu x0, kvm_hyp_ctxt, x1 + adr_l x1, hyp_panic + str x1, [x0, #CPU_XREG_OFFSET(30)] + + get_vcpu_ptr x1, x0 + +SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) + // x0: return code + // x1: vcpu + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack + + add x1, x1, #VCPU_CONTEXT + + ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) + + // Store the guest regs x2 and x3 + stp x2, x3, [x1, #CPU_XREG_OFFSET(2)] + + // Retrieve the guest regs x0-x1 from the stack + ldp x2, x3, [sp], #16 // x0, x1 + + // Store the guest regs x0-x1 and x4-x17 + stp x2, x3, [x1, #CPU_XREG_OFFSET(0)] + stp x4, x5, [x1, #CPU_XREG_OFFSET(4)] + stp x6, x7, [x1, #CPU_XREG_OFFSET(6)] + stp x8, x9, [x1, #CPU_XREG_OFFSET(8)] + stp x10, x11, [x1, #CPU_XREG_OFFSET(10)] + stp x12, x13, [x1, #CPU_XREG_OFFSET(12)] + stp x14, x15, [x1, #CPU_XREG_OFFSET(14)] + stp x16, x17, [x1, #CPU_XREG_OFFSET(16)] + + // Store the guest regs x18-x29, lr + save_callee_saved_regs x1 + + // Store the guest's sp_el0 + save_sp_el0 x1, x2 + + adr_this_cpu x2, kvm_hyp_ctxt, x3 + + // Macro ptrauth_switch_to_hyp format: + // ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3) + // The below macro to save/restore keys is not implemented in C code + // as it may cause Pointer Authentication key signing mismatch errors + // when this feature is enabled for kernel code. 
+ ptrauth_switch_to_hyp x1, x2, x3, x4, x5 + + // mte_switch_to_hyp(g_ctxt, h_ctxt, reg1) + mte_switch_to_hyp x1, x2, x3 + + // Restore hyp's sp_el0 + restore_sp_el0 x2, x3 + + // Now restore the hyp regs + restore_callee_saved_regs x2 + + set_loaded_vcpu xzr, x2, x3 + +alternative_if ARM64_HAS_RAS_EXTN + // If we have the RAS extensions we can consume a pending error + // without an unmask-SError and isb. The ESB-instruction consumed any + // pending guest error when we took the exception from the guest. + mrs_s x2, SYS_DISR_EL1 + str x2, [x1, #(VCPU_FAULT_DISR - VCPU_CONTEXT)] + cbz x2, 1f + msr_s SYS_DISR_EL1, xzr + orr x0, x0, #(1<<ARM_EXIT_WITH_SERROR_BIT) +1: ret +alternative_else + dsb sy // Synchronize against in-flight ld/st + isb // Prevent an early read of side-effect free ISR + mrs x2, isr_el1 + tbnz x2, #ISR_EL1_A_SHIFT, 2f + ret + nop +2: +alternative_endif + // We know we have a pending asynchronous abort, now is the + // time to flush it out. From your VAXorcist book, page 666: + // "Threaten me not, oh Evil one! For I speak with + // the power of DEC, and I command thee to show thyself!" + mrs x2, elr_el2 + mrs x3, esr_el2 + mrs x4, spsr_el2 + mov x5, x0 + + msr daifclr, #4 // Unmask aborts + + // This is our single instruction exception window. A pending + // SError is guaranteed to occur at the earliest when we unmask + // it, and at the latest just after the ISB. +abort_guest_exit_start: + + isb + +abort_guest_exit_end: + + msr daifset, #4 // Mask aborts + ret + + _kvm_extable abort_guest_exit_start, 9997f + _kvm_extable abort_guest_exit_end, 9997f +9997: + msr daifset, #4 // Mask aborts + mov x0, #(1 << ARM_EXIT_WITH_SERROR_BIT) + + // restore the EL1 exception context so that we can report some + // information. Merge the exception code with the SError pending bit. + msr elr_el2, x2 + msr esr_el2, x3 + msr spsr_el2, x4 + orr x0, x0, x5 +1: ret +SYM_FUNC_END(__guest_enter) diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c new file mode 100644 index 000000000000..bef40ddb16db --- /dev/null +++ b/arch/arm64/kvm/hyp/exception.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Fault injection for both 32 and 64bit guests. + * + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + * + * Based on arch/arm/kvm/emulate.c + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + */ + +#include <hyp/adjust_pc.h> +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> + +#if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__) +#error Hypervisor code only! 
+#endif + +static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) +{ + if (has_vhe()) + return vcpu_read_sys_reg(vcpu, reg); + + return __vcpu_sys_reg(vcpu, reg); +} + +static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) +{ + if (has_vhe()) + vcpu_write_sys_reg(vcpu, val, reg); + else + __vcpu_assign_sys_reg(vcpu, reg, val); +} + +static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, + u64 val) +{ + if (has_vhe()) { + if (target_mode == PSR_MODE_EL1h) + vcpu_write_sys_reg(vcpu, val, SPSR_EL1); + else + vcpu_write_sys_reg(vcpu, val, SPSR_EL2); + } else { + __vcpu_assign_sys_reg(vcpu, SPSR_EL1, val); + } +} + +static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) +{ + if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) + write_sysreg(val, spsr_abt); + else + vcpu->arch.ctxt.spsr_abt = val; +} + +static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) +{ + if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) + write_sysreg(val, spsr_und); + else + vcpu->arch.ctxt.spsr_und = val; +} + +/* + * This performs the exception entry at a given EL (@target_mode), stashing PC + * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. + * The EL passed to this function *must* be a non-secure, privileged mode with + * bit 0 being set (PSTATE.SP == 1). + * + * When an exception is taken, most PSTATE fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all + * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx + * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0. + * + * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429. + * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426. + * + * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from + * MSB to LSB. + */ +static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, + enum exception_type type) +{ + unsigned long sctlr, vbar, old, new, mode; + u64 exc_offset; + + mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + + if (mode == target_mode) + exc_offset = CURRENT_EL_SP_ELx_VECTOR; + else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) + exc_offset = CURRENT_EL_SP_EL0_VECTOR; + else if (!(mode & PSR_MODE32_BIT)) + exc_offset = LOWER_EL_AArch64_VECTOR; + else + exc_offset = LOWER_EL_AArch32_VECTOR; + + switch (target_mode) { + case PSR_MODE_EL1h: + vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1); + sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); + __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); + break; + case PSR_MODE_EL2h: + vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL2); + sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL2); + __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL2); + break; + default: + /* Don't do that */ + BUG(); + } + + *vcpu_pc(vcpu) = vbar + exc_offset + type; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_N_BIT); + new |= (old & PSR_Z_BIT); + new |= (old & PSR_C_BIT); + new |= (old & PSR_V_BIT); + + if (kvm_has_mte(kern_hyp_va(vcpu->kvm))) + new |= PSR_TCO_BIT; + + new |= (old & PSR_DIT_BIT); + + // PSTATE.UAO is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D5-2579. + + // PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0 + // SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page D5-2578. 
+ new |= (old & PSR_PAN_BIT); + if (!(sctlr & SCTLR_EL1_SPAN)) + new |= PSR_PAN_BIT; + + // PSTATE.SS is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D2-2452. + + // PSTATE.IL is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D1-2306. + + // PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64 + // See ARM DDI 0487E.a, page D13-3258 + if (sctlr & SCTLR_ELx_DSSBS) + new |= PSR_SSBS_BIT; + + // PSTATE.BTYPE is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, pages D1-2293 to D1-2294. + + new |= PSR_D_BIT; + new |= PSR_A_BIT; + new |= PSR_I_BIT; + new |= PSR_F_BIT; + + new |= target_mode; + + *vcpu_cpsr(vcpu) = new; + __vcpu_write_spsr(vcpu, target_mode, old); +} + +/* + * When an exception is taken, most CPSR fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). + * + * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with + * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was + * obsoleted by the ARMv7 virtualization extensions and is RES0. + * + * For the SPSR layout seen from AArch32, see: + * - ARM DDI 0406C.d, page B1-1148 + * - ARM DDI 0487E.a, page G8-6264 + * + * For the SPSR_ELx layout for AArch32 seen from AArch64, see: + * - ARM DDI 0487E.a, page C5-426 + * + * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from + * MSB to LSB. + */ +static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) +{ + u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); + unsigned long old, new; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_AA32_N_BIT); + new |= (old & PSR_AA32_Z_BIT); + new |= (old & PSR_AA32_C_BIT); + new |= (old & PSR_AA32_V_BIT); + new |= (old & PSR_AA32_Q_BIT); + + // CPSR.IT[7:0] are set to zero upon any exception + // See ARM DDI 0487E.a, section G1.12.3 + // See ARM DDI 0406C.d, section B1.8.3 + + new |= (old & PSR_AA32_DIT_BIT); + + // CPSR.SSBS is set to SCTLR.DSSBS upon any exception + // See ARM DDI 0487E.a, page G8-6244 + if (sctlr & BIT(31)) + new |= PSR_AA32_SSBS_BIT; + + // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 + // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page G8-6246 + new |= (old & PSR_AA32_PAN_BIT); + if (!(sctlr & BIT(23))) + new |= PSR_AA32_PAN_BIT; + + // SS does not exist in AArch32, so ignore + + // CPSR.IL is set to zero upon any exception + // See ARM DDI 0487E.a, page G1-5527 + + new |= (old & PSR_AA32_GE_MASK); + + // CPSR.IT[7:0] are set to zero upon any exception + // See prior comment above + + // CPSR.E is set to SCTLR.EE upon any exception + // See ARM DDI 0487E.a, page G8-6245 + // See ARM DDI 0406C.d, page B4-1701 + if (sctlr & BIT(25)) + new |= PSR_AA32_E_BIT; + + // CPSR.A is unchanged upon an exception to Undefined, Supervisor + // CPSR.A is set upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_A_BIT); + if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) + new |= PSR_AA32_A_BIT; + + // CPSR.I is set upon any exception + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= PSR_AA32_I_BIT; + + // CPSR.F is set upon an exception to FIQ + // CPSR.F is unchanged upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_F_BIT); + if (mode == 
PSR_AA32_MODE_FIQ) + new |= PSR_AA32_F_BIT; + + // CPSR.T is set to SCTLR.TE upon any exception + // See ARM DDI 0487E.a, page G8-5514 + // See ARM DDI 0406C.d, page B1-1181 + if (sctlr & BIT(30)) + new |= PSR_AA32_T_BIT; + + new |= mode; + + return new; +} + +/* + * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. + */ +static const u8 return_offsets[8][2] = { + [0] = { 0, 0 }, /* Reset, unused */ + [1] = { 4, 2 }, /* Undefined */ + [2] = { 0, 0 }, /* SVC, unused */ + [3] = { 4, 4 }, /* Prefetch abort */ + [4] = { 8, 8 }, /* Data abort */ + [5] = { 0, 0 }, /* HVC, unused */ + [6] = { 4, 4 }, /* IRQ, unused */ + [7] = { 4, 4 }, /* FIQ, unused */ +}; + +static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) +{ + unsigned long spsr = *vcpu_cpsr(vcpu); + bool is_thumb = (spsr & PSR_AA32_T_BIT); + u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); + u32 return_address; + + *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); + return_address = *vcpu_pc(vcpu); + return_address += return_offsets[vect_offset >> 2][is_thumb]; + + /* KVM only enters the ABT and UND modes, so only deal with those */ + switch(mode) { + case PSR_AA32_MODE_ABT: + __vcpu_write_spsr_abt(vcpu, host_spsr_to_spsr32(spsr)); + vcpu_gp_regs(vcpu)->compat_lr_abt = return_address; + break; + + case PSR_AA32_MODE_UND: + __vcpu_write_spsr_und(vcpu, host_spsr_to_spsr32(spsr)); + vcpu_gp_regs(vcpu)->compat_lr_und = return_address; + break; + } + + /* Branch to exception vector */ + if (sctlr & (1 << 13)) + vect_offset += 0xffff0000; + else /* always have security exceptions */ + vect_offset += __vcpu_read_sys_reg(vcpu, VBAR_EL1); + + *vcpu_pc(vcpu) = vect_offset; +} + +static void kvm_inject_exception(struct kvm_vcpu *vcpu) +{ + if (vcpu_el1_is_32bit(vcpu)) { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA32_UND): + enter_exception32(vcpu, PSR_AA32_MODE_UND, 4); + break; + case unpack_vcpu_flag(EXCEPT_AA32_IABT): + enter_exception32(vcpu, PSR_AA32_MODE_ABT, 12); + break; + case unpack_vcpu_flag(EXCEPT_AA32_DABT): + enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16); + break; + default: + /* Err... */ + break; + } + } else { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); + break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR): + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_serror); + break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): + enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync); + break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL2_IRQ): + enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq); + break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR): + enter_exception64(vcpu, PSR_MODE_EL2h, except_type_serror); + break; + + default: + /* + * Only EL1_{SYNC,SERR} and EL2_{SYNC,IRQ,SERR} makes + * sense so far. Everything else gets silently + * ignored. + */ + break; + } + } +} + +/* + * Adjust the guest PC (and potentially exception state) depending on + * flags provided by the emulation code. 
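+ * A pending exception takes precedence over a simple PC increment: the two
+ * flags are checked in that order below.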
+ */ +void __kvm_adjust_pc(struct kvm_vcpu *vcpu) +{ + if (vcpu_get_flag(vcpu, PENDING_EXCEPTION)) { + kvm_inject_exception(vcpu); + vcpu_clear_flag(vcpu, PENDING_EXCEPTION); + vcpu_clear_flag(vcpu, EXCEPT_MASK); + } else if (vcpu_get_flag(vcpu, INCREMENT_PC)) { + kvm_skip_instr(vcpu); + vcpu_clear_flag(vcpu, INCREMENT_PC); + } +} diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S new file mode 100644 index 000000000000..e950875e31ce --- /dev/null +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/linkage.h> + +#include <asm/fpsimdmacros.h> + + .text + +SYM_FUNC_START(__fpsimd_save_state) + fpsimd_save x0, 1 + ret +SYM_FUNC_END(__fpsimd_save_state) + +SYM_FUNC_START(__fpsimd_restore_state) + fpsimd_restore x0, 1 + ret +SYM_FUNC_END(__fpsimd_restore_state) + +SYM_FUNC_START(__sve_restore_state) + mov x2, #1 + sve_load 0, x1, x2, 3 + ret +SYM_FUNC_END(__sve_restore_state) + +SYM_FUNC_START(__sve_save_state) + mov x2, #1 + sve_save 0, x1, x2, 3 + ret +SYM_FUNC_END(__sve_save_state) diff --git a/arch/arm64/kvm/hyp/hyp-constants.c b/arch/arm64/kvm/hyp/hyp-constants.c new file mode 100644 index 000000000000..b257a3b4bfc5 --- /dev/null +++ b/arch/arm64/kvm/hyp/hyp-constants.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/kbuild.h> +#include <nvhe/memory.h> +#include <nvhe/pkvm.h> + +int main(void) +{ + DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page)); + DEFINE(PKVM_HYP_VM_SIZE, sizeof(struct pkvm_hyp_vm)); + DEFINE(PKVM_HYP_VCPU_SIZE, sizeof(struct pkvm_hyp_vcpu)); + return 0; +} diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S new file mode 100644 index 000000000000..03f97d71984c --- /dev/null +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2015-2018 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/arm-smccc.h> +#include <linux/linkage.h> + +#include <asm/alternative.h> +#include <asm/assembler.h> +#include <asm/cpufeature.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> +#include <asm/mmu.h> +#include <asm/spectre.h> + +.macro save_caller_saved_regs_vect + /* x0 and x1 were saved in the vector entry */ + stp x2, x3, [sp, #-16]! + stp x4, x5, [sp, #-16]! + stp x6, x7, [sp, #-16]! + stp x8, x9, [sp, #-16]! + stp x10, x11, [sp, #-16]! + stp x12, x13, [sp, #-16]! + stp x14, x15, [sp, #-16]! + stp x16, x17, [sp, #-16]! +.endm + +.macro restore_caller_saved_regs_vect + ldp x16, x17, [sp], #16 + ldp x14, x15, [sp], #16 + ldp x12, x13, [sp], #16 + ldp x10, x11, [sp], #16 + ldp x8, x9, [sp], #16 + ldp x6, x7, [sp], #16 + ldp x4, x5, [sp], #16 + ldp x2, x3, [sp], #16 + ldp x0, x1, [sp], #16 +.endm + + .text + +el1_sync: // Guest trapped into EL2 + + mrs x0, esr_el2 + ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH + cmp x0, #ESR_ELx_EC_HVC64 + ccmp x0, #ESR_ELx_EC_HVC32, #4, ne + b.ne el1_trap + + /* + * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1. + * The workaround has already been applied on the host, + * so let's quickly get back to the guest. We don't bother + * restoring x1, as it can be clobbered anyway. 
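+ * The guest's x0 (stashed on the stack by the vector preamble) is matched
+ * against the ARCH_WORKAROUND_{1,2,3} function IDs below; anything else is
+ * handled as a normal trap via el1_trap.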
+ */ + ldr x1, [sp] // Guest's x0 + eor w1, w1, #ARM_SMCCC_ARCH_WORKAROUND_1 + cbz w1, wa_epilogue + + /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */ + eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_1 ^ \ + ARM_SMCCC_ARCH_WORKAROUND_2) + cbz w1, wa_epilogue + + eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_2 ^ \ + ARM_SMCCC_ARCH_WORKAROUND_3) + cbnz w1, el1_trap + +wa_epilogue: + mov x0, xzr + add sp, sp, #16 + eret + sb + +el1_trap: + get_vcpu_ptr x1, x0 + mov x0, #ARM_EXCEPTION_TRAP + b __guest_exit + +el1_irq: +el1_fiq: + get_vcpu_ptr x1, x0 + mov x0, #ARM_EXCEPTION_IRQ + b __guest_exit + +el1_error: + get_vcpu_ptr x1, x0 + mov x0, #ARM_EXCEPTION_EL1_SERROR + b __guest_exit + +el2_sync: + /* Check for illegal exception return */ + mrs x0, spsr_el2 + tbnz x0, #20, 1f + + save_caller_saved_regs_vect + stp x29, x30, [sp, #-16]! + bl kvm_unexpected_el2_exception + ldp x29, x30, [sp], #16 + restore_caller_saved_regs_vect + + eret + +1: + /* Let's attempt a recovery from the illegal exception return */ + get_vcpu_ptr x1, x0 + mov x0, #ARM_EXCEPTION_IL + b __guest_exit + + +el2_error: + save_caller_saved_regs_vect + stp x29, x30, [sp, #-16]! + + bl kvm_unexpected_el2_exception + + ldp x29, x30, [sp], #16 + restore_caller_saved_regs_vect + + eret + sb + +.macro invalid_vector label, target = __guest_exit_panic + .align 2 +SYM_CODE_START_LOCAL(\label) + b \target +SYM_CODE_END(\label) +.endm + + /* None of these should ever happen */ + invalid_vector el2t_sync_invalid + invalid_vector el2t_irq_invalid + invalid_vector el2t_fiq_invalid + invalid_vector el2t_error_invalid + invalid_vector el2h_irq_invalid + invalid_vector el2h_fiq_invalid + + .ltorg + + .align 11 + +.macro check_preamble_length start, end +/* kvm_patch_vector_branch() generates code that jumps over the preamble. */ +.if ((\end-\start) != KVM_VECTOR_PREAMBLE) + .error "KVM vector preamble length mismatch" +.endif +.endm + +.macro valid_vect target + .align 7 +661: + esb + stp x0, x1, [sp, #-16]! +662: + /* + * spectre vectors __bp_harden_hyp_vecs generate br instructions at runtime + * that jump at offset 8 at __kvm_hyp_vector. + * As hyp .text is guarded section, it needs bti j. + */ + bti j + b \target + +check_preamble_length 661b, 662b +.endm + +.macro invalid_vect target + .align 7 +661: + nop + stp x0, x1, [sp, #-16]! 
+662: + /* Check valid_vect */ + bti j + b \target + +check_preamble_length 661b, 662b +.endm + +SYM_CODE_START(__kvm_hyp_vector) + invalid_vect el2t_sync_invalid // Synchronous EL2t + invalid_vect el2t_irq_invalid // IRQ EL2t + invalid_vect el2t_fiq_invalid // FIQ EL2t + invalid_vect el2t_error_invalid // Error EL2t + + valid_vect el2_sync // Synchronous EL2h + invalid_vect el2h_irq_invalid // IRQ EL2h + invalid_vect el2h_fiq_invalid // FIQ EL2h + valid_vect el2_error // Error EL2h + + valid_vect el1_sync // Synchronous 64-bit EL1 + valid_vect el1_irq // IRQ 64-bit EL1 + valid_vect el1_fiq // FIQ 64-bit EL1 + valid_vect el1_error // Error 64-bit EL1 + + valid_vect el1_sync // Synchronous 32-bit EL1 + valid_vect el1_irq // IRQ 32-bit EL1 + valid_vect el1_fiq // FIQ 32-bit EL1 + valid_vect el1_error // Error 32-bit EL1 +SYM_CODE_END(__kvm_hyp_vector) + +.macro spectrev2_smccc_wa1_smc + sub sp, sp, #(8 * 4) + stp x2, x3, [sp, #(8 * 0)] + stp x0, x1, [sp, #(8 * 2)] + alternative_cb ARM64_ALWAYS_SYSTEM, spectre_bhb_patch_wa3 + /* Patched to mov WA3 when supported */ + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1 + alternative_cb_end + smc #0 + ldp x2, x3, [sp, #(8 * 0)] + add sp, sp, #(8 * 2) +.endm + +.macro hyp_ventry indirect, spectrev2 + .align 7 +1: esb + .if \spectrev2 != 0 + spectrev2_smccc_wa1_smc + .else + stp x0, x1, [sp, #-16]! + mitigate_spectre_bhb_loop x0 + mitigate_spectre_bhb_clear_insn + .endif + .if \indirect != 0 + alternative_cb ARM64_ALWAYS_SYSTEM, kvm_patch_vector_branch + /* + * For ARM64_SPECTRE_V3A configurations, these NOPs get replaced with: + * + * movz x0, #(addr & 0xffff) + * movk x0, #((addr >> 16) & 0xffff), lsl #16 + * movk x0, #((addr >> 32) & 0xffff), lsl #32 + * br x0 + * + * Where: + * addr = kern_hyp_va(__kvm_hyp_vector) + vector-offset + KVM_VECTOR_PREAMBLE. + * See kvm_patch_vector_branch for details. + */ + nop + nop + nop + nop + alternative_cb_end + .endif + b __kvm_hyp_vector + (1b - 0b + KVM_VECTOR_PREAMBLE) +.endm + +.macro generate_vectors indirect, spectrev2 +0: + .rept 16 + hyp_ventry \indirect, \spectrev2 + .endr + .org 0b + SZ_2K // Safety measure +.endm + + .align 11 +SYM_CODE_START(__bp_harden_hyp_vecs) + generate_vectors indirect = 0, spectrev2 = 1 // HYP_VECTOR_SPECTRE_DIRECT + generate_vectors indirect = 1, spectrev2 = 0 // HYP_VECTOR_INDIRECT + generate_vectors indirect = 1, spectrev2 = 1 // HYP_VECTOR_SPECTRE_INDIRECT +1: .org __bp_harden_hyp_vecs + __BP_HARDEN_HYP_VECS_SZ + .org 1b +SYM_CODE_END(__bp_harden_hyp_vecs) diff --git a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h new file mode 100644 index 000000000000..4fdfeabefeb4 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Guest PC manipulation helpers + * + * Copyright (C) 2012,2013 - ARM Ltd + * Copyright (C) 2020 - Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#ifndef __ARM64_KVM_HYP_ADJUST_PC_H__ +#define __ARM64_KVM_HYP_ADJUST_PC_H__ + +#include <asm/kvm_emulate.h> +#include <asm/kvm_host.h> + +static inline void kvm_skip_instr(struct kvm_vcpu *vcpu) +{ + if (vcpu_mode_is_32bit(vcpu)) { + kvm_skip_instr32(vcpu); + } else { + *vcpu_pc(vcpu) += 4; + *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK; + } + + /* advance the singlestep state machine */ + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; +} + +/* + * Skip an instruction which has been emulated at hyp while most guest sysregs + * are live. 
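+ * PC and PSTATE are re-read from ELR_EL2/SPSR_EL2, adjusted, and written
+ * back so that the eventual ERET to the guest uses the updated values.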
+ */ +static inline void __kvm_skip_instr(struct kvm_vcpu *vcpu) +{ + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + vcpu_gp_regs(vcpu)->pstate = read_sysreg_el2(SYS_SPSR); + + kvm_skip_instr(vcpu); + + write_sysreg_el2(vcpu_gp_regs(vcpu)->pstate, SYS_SPSR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); +} + +/* + * Skip an instruction while host sysregs are live. + * Assumes host is always 64-bit. + */ +static inline void kvm_skip_host_instr(void) +{ + write_sysreg_el2(read_sysreg_el2(SYS_ELR) + 4, SYS_ELR); +} + +#endif diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h new file mode 100644 index 000000000000..502a5b73ee70 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#ifndef __ARM64_KVM_HYP_DEBUG_SR_H__ +#define __ARM64_KVM_HYP_DEBUG_SR_H__ + +#include <linux/compiler.h> +#include <linux/kvm_host.h> + +#include <asm/debug-monitors.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +#define read_debug(r,n) read_sysreg(r##n##_el1) +#define write_debug(v,r,n) write_sysreg(v, r##n##_el1) + +#define save_debug(ptr,reg,nr) \ + switch (nr) { \ + case 15: ptr[15] = read_debug(reg, 15); \ + fallthrough; \ + case 14: ptr[14] = read_debug(reg, 14); \ + fallthrough; \ + case 13: ptr[13] = read_debug(reg, 13); \ + fallthrough; \ + case 12: ptr[12] = read_debug(reg, 12); \ + fallthrough; \ + case 11: ptr[11] = read_debug(reg, 11); \ + fallthrough; \ + case 10: ptr[10] = read_debug(reg, 10); \ + fallthrough; \ + case 9: ptr[9] = read_debug(reg, 9); \ + fallthrough; \ + case 8: ptr[8] = read_debug(reg, 8); \ + fallthrough; \ + case 7: ptr[7] = read_debug(reg, 7); \ + fallthrough; \ + case 6: ptr[6] = read_debug(reg, 6); \ + fallthrough; \ + case 5: ptr[5] = read_debug(reg, 5); \ + fallthrough; \ + case 4: ptr[4] = read_debug(reg, 4); \ + fallthrough; \ + case 3: ptr[3] = read_debug(reg, 3); \ + fallthrough; \ + case 2: ptr[2] = read_debug(reg, 2); \ + fallthrough; \ + case 1: ptr[1] = read_debug(reg, 1); \ + fallthrough; \ + default: ptr[0] = read_debug(reg, 0); \ + } + +#define restore_debug(ptr,reg,nr) \ + switch (nr) { \ + case 15: write_debug(ptr[15], reg, 15); \ + fallthrough; \ + case 14: write_debug(ptr[14], reg, 14); \ + fallthrough; \ + case 13: write_debug(ptr[13], reg, 13); \ + fallthrough; \ + case 12: write_debug(ptr[12], reg, 12); \ + fallthrough; \ + case 11: write_debug(ptr[11], reg, 11); \ + fallthrough; \ + case 10: write_debug(ptr[10], reg, 10); \ + fallthrough; \ + case 9: write_debug(ptr[9], reg, 9); \ + fallthrough; \ + case 8: write_debug(ptr[8], reg, 8); \ + fallthrough; \ + case 7: write_debug(ptr[7], reg, 7); \ + fallthrough; \ + case 6: write_debug(ptr[6], reg, 6); \ + fallthrough; \ + case 5: write_debug(ptr[5], reg, 5); \ + fallthrough; \ + case 4: write_debug(ptr[4], reg, 4); \ + fallthrough; \ + case 3: write_debug(ptr[3], reg, 3); \ + fallthrough; \ + case 2: write_debug(ptr[2], reg, 2); \ + fallthrough; \ + case 1: write_debug(ptr[1], reg, 1); \ + fallthrough; \ + default: write_debug(ptr[0], reg, 0); \ + } + +static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.debug_owner) { + case VCPU_DEBUG_FREE: + WARN_ON_ONCE(1); + fallthrough; + case VCPU_DEBUG_GUEST_OWNED: + return &vcpu->arch.vcpu_debug_state; + case VCPU_DEBUG_HOST_OWNED: + return &vcpu->arch.external_debug_state; + } + + 
return NULL; +} + +static void __debug_save_state(struct kvm_guest_debug_arch *dbg, + struct kvm_cpu_context *ctxt) +{ + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); + + save_debug(dbg->dbg_bcr, dbgbcr, brps); + save_debug(dbg->dbg_bvr, dbgbvr, brps); + save_debug(dbg->dbg_wcr, dbgwcr, wrps); + save_debug(dbg->dbg_wvr, dbgwvr, wrps); + + ctxt_sys_reg(ctxt, MDCCINT_EL1) = read_sysreg(mdccint_el1); +} + +static void __debug_restore_state(struct kvm_guest_debug_arch *dbg, + struct kvm_cpu_context *ctxt) +{ + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); + + restore_debug(dbg->dbg_bcr, dbgbcr, brps); + restore_debug(dbg->dbg_bvr, dbgbvr, brps); + restore_debug(dbg->dbg_wcr, dbgwcr, wrps); + restore_debug(dbg->dbg_wvr, dbgwvr, wrps); + + write_sysreg(ctxt_sys_reg(ctxt, MDCCINT_EL1), mdccint_el1); +} + +static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *host_ctxt; + struct kvm_cpu_context *guest_ctxt; + struct kvm_guest_debug_arch *host_dbg; + struct kvm_guest_debug_arch *guest_dbg; + + if (!kvm_debug_regs_in_use(vcpu)) + return; + + host_ctxt = host_data_ptr(host_ctxt); + guest_ctxt = &vcpu->arch.ctxt; + host_dbg = host_data_ptr(host_debug_state.regs); + guest_dbg = __vcpu_debug_regs(vcpu); + + __debug_save_state(host_dbg, host_ctxt); + __debug_restore_state(guest_dbg, guest_ctxt); +} + +static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *host_ctxt; + struct kvm_cpu_context *guest_ctxt; + struct kvm_guest_debug_arch *host_dbg; + struct kvm_guest_debug_arch *guest_dbg; + + if (!kvm_debug_regs_in_use(vcpu)) + return; + + host_ctxt = host_data_ptr(host_ctxt); + guest_ctxt = &vcpu->arch.ctxt; + host_dbg = host_data_ptr(host_debug_state.regs); + guest_dbg = __vcpu_debug_regs(vcpu); + + __debug_save_state(guest_dbg, guest_ctxt); + __debug_restore_state(host_dbg, host_ctxt); +} + +#endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h new file mode 100644 index 000000000000..fc573fc767b0 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#ifndef __ARM64_KVM_HYP_FAULT_H__ +#define __ARM64_KVM_HYP_FAULT_H__ + +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +static inline bool __fault_safe_to_translate(u64 esr) +{ + u64 fsc = esr & ESR_ELx_FSC; + + if (esr_fsc_is_sea_ttw(esr) || esr_fsc_is_secc_ttw(esr)) + return false; + + return !(fsc == ESR_ELx_FSC_EXTABT && (esr & ESR_ELx_FnV)); +} + +static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) +{ + int ret; + u64 par, tmp; + + /* + * Resolve the IPA the hard way using the guest VA. + * + * Stage-1 translation already validated the memory access + * rights. As such, we can use the EL1 translation regime, and + * don't have to distinguish between EL0 and EL1 access. + * + * We do need to save/restore PAR_EL1 though, as we haven't + * saved the guest context yet, and we may return early... + */ + par = read_sysreg_par(); + ret = system_supports_poe() ? 
__kvm_at(OP_AT_S1E1A, far) : + __kvm_at(OP_AT_S1E1R, far); + if (!ret) + tmp = read_sysreg_par(); + else + tmp = SYS_PAR_EL1_F; /* back to the guest */ + write_sysreg(par, par_el1); + + if (unlikely(tmp & SYS_PAR_EL1_F)) + return false; /* Translation failed, back to guest */ + + /* Convert PAR to HPFAR format */ + *hpfar = PAR_TO_HPFAR(tmp); + return true; +} + +/* + * Checks for the conditions when HPFAR_EL2 is written, per ARM ARM R_FKLWR. + */ +static inline bool __hpfar_valid(u64 esr) +{ + /* + * CPUs affected by ARM erratum #834220 may incorrectly report a + * stage-2 translation fault when a stage-1 permission fault occurs. + * + * Re-walk the page tables to determine if a stage-1 fault actually + * occurred. + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_834220) && + esr_fsc_is_translation_fault(esr)) + return false; + + if (esr_fsc_is_translation_fault(esr) || esr_fsc_is_access_flag_fault(esr)) + return true; + + if ((esr & ESR_ELx_S1PTW) && esr_fsc_is_permission_fault(esr)) + return true; + + return esr_fsc_is_addr_sz_fault(esr); +} + +static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +{ + u64 hpfar; + + fault->far_el2 = read_sysreg_el2(SYS_FAR); + fault->hpfar_el2 = 0; + + if (__hpfar_valid(esr)) + hpfar = read_sysreg(hpfar_el2); + else if (unlikely(!__fault_safe_to_translate(esr))) + return true; + else if (!__translate_far_to_hpfar(fault->far_el2, &hpfar)) + return false; + + /* + * Hijack HPFAR_EL2.NS (RES0 in Non-secure) to indicate a valid + * HPFAR value. + */ + fault->hpfar_el2 = hpfar | HPFAR_EL2_NS; + return true; +} + +#endif diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h new file mode 100644 index 000000000000..c5d5e5b86eaf --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -0,0 +1,952 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#ifndef __ARM64_KVM_HYP_SWITCH_H__ +#define __ARM64_KVM_HYP_SWITCH_H__ + +#include <hyp/adjust_pc.h> +#include <hyp/fault.h> + +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> +#include <linux/types.h> +#include <linux/jump_label.h> +#include <uapi/linux/psci.h> + +#include <kvm/arm_psci.h> + +#include <asm/barrier.h> +#include <asm/cpufeature.h> +#include <asm/extable.h> +#include <asm/kprobes.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> +#include <asm/fpsimd.h> +#include <asm/debug-monitors.h> +#include <asm/processor.h> +#include <asm/traps.h> + +struct kvm_exception_table_entry { + int insn, fixup; +}; + +extern struct kvm_exception_table_entry __start___kvm_ex_table; +extern struct kvm_exception_table_entry __stop___kvm_ex_table; + +/* Save the 32-bit only FPSIMD system register state */ +static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu) +{ + if (!vcpu_el1_is_32bit(vcpu)) + return; + + __vcpu_assign_sys_reg(vcpu, FPEXC32_EL2, read_sysreg(fpexc32_el2)); +} + +static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) +{ + /* + * We are about to set CPTR_EL2.TFP to trap all floating point + * register accesses to EL2, however, the ARM ARM clearly states that + * traps are only taken to EL2 if the operation would not otherwise + * trap to EL1. Therefore, always make sure that for 32-bit guests, + * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit. 
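+ * (FPEXC.EN is bit [30], hence the 1 << 30 write below.)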
+ * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to + * it will cause an exception. + */ + if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd()) { + write_sysreg(1 << 30, fpexc32_el2); + isb(); + } +} + +static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); +} + +static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + /* + * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to + * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, + * except for some missing controls, such as TAM. + * In this case, CPTR_EL2.TAM has the same position with or without + * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM + * shift value for trapping the AMU accesses. + */ + u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA; + u64 cptr; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } + + if (!vcpu_has_nv(vcpu)) + goto write; + + /* + * The architecture is a bit crap (what a surprise): an EL2 guest + * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, + * as they are RES0 in the guest's view. To work around it, trap the + * sucker using the very same bit it can't set... + */ + if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) + val |= CPTR_EL2_TCPAC; + + /* + * Layer the guest hypervisor's trap configuration on top of our own if + * we're in a nested context. + */ + if (is_hyp_ctxt(vcpu)) + goto write; + + cptr = vcpu_sanitised_cptr_el2(vcpu); + + /* + * Pay attention, there's some interesting detail here. + * + * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two + * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): + * + * - CPTR_EL2.xEN = x0, traps are enabled + * - CPTR_EL2.xEN = x1, traps are disabled + * + * In other words, bit[0] determines if guest accesses trap or not. In + * the interest of simplicity, clear the entire field if the guest + * hypervisor has traps enabled to dispel any illusion of something more + * complicated taking place. 
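+ * The SYS_FIELD_GET() checks below test exactly that bit[0], for both the
+ * FPEN and ZEN fields.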
+ */ + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; + + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + val |= cptr & CPACR_EL1_E0POE; + + val |= cptr & CPTR_EL2_TCPAC; + +write: + write_sysreg(val, cpacr_el1); +} + +static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + + if (has_vhe() || has_hvhe()) + __activate_cptr_traps_vhe(vcpu); + else + __activate_cptr_traps_nvhe(vcpu); +} + +static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1; + + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); +} + +static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN; + + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); +} + +static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_vhe() || has_hvhe()) + __deactivate_cptr_traps_vhe(vcpu); + else + __deactivate_cptr_traps_nvhe(vcpu); +} + +static inline bool cpu_has_amu(void) +{ + u64 pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); + + return cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_EL1_AMU_SHIFT); +} + +#define __activate_fgt(hctxt, vcpu, reg) \ + do { \ + ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ + write_sysreg_s(*vcpu_fgt(vcpu, reg), SYS_ ## reg); \ + } while (0) + +static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + return; + + __activate_fgt(hctxt, vcpu, HFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HFGWTR_EL2); + __activate_fgt(hctxt, vcpu, HFGITR_EL2); + __activate_fgt(hctxt, vcpu, HDFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HDFGWTR_EL2); + + if (cpu_has_amu()) + __activate_fgt(hctxt, vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __activate_fgt(hctxt, vcpu, HFGRTR2_EL2); + __activate_fgt(hctxt, vcpu, HFGWTR2_EL2); + __activate_fgt(hctxt, vcpu, HFGITR2_EL2); + __activate_fgt(hctxt, vcpu, HDFGRTR2_EL2); + __activate_fgt(hctxt, vcpu, HDFGWTR2_EL2); +} + +#define __deactivate_fgt(htcxt, vcpu, reg) \ + do { \ + write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ + SYS_ ## reg); \ + } while(0) + +static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + return; + + __deactivate_fgt(hctxt, vcpu, HFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR_EL2); + + if (cpu_has_amu()) + __deactivate_fgt(hctxt, vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __deactivate_fgt(hctxt, vcpu, HFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2); +} + +static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) +{ + u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + + if (!system_supports_mpam()) + return; + + /* trap guest access to 
MPAMIDR_EL1 */ + if (system_supports_mpam_hcr()) { + write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); + } else { + /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ + r |= MPAM2_EL2_TIDR; + } + + write_sysreg_s(r, SYS_MPAM2_EL2); +} + +static inline void __deactivate_traps_mpam(void) +{ + if (!system_supports_mpam()) + return; + + write_sysreg_s(0, SYS_MPAM2_EL2); + + if (system_supports_mpam_hcr()) + write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); +} + +static inline void __activate_traps_common(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ + write_sysreg(1 << 15, hstr_el2); + + /* + * Make sure we trap PMU access from EL0 to EL2. Also sanitize + * PMSELR_EL0 to make sure it never contains the cycle + * counter, which could make a PMXEVCNTR_EL0 access UNDEF at + * EL1 instead of being trapped to EL2. + */ + if (system_supports_pmuv3()) { + write_sysreg(0, pmselr_el0); + + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); + write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); + } + + if (cpus_have_final_cap(ARM64_HAS_HCX)) { + u64 hcrx = vcpu->arch.hcrx_el2; + if (is_nested_ctxt(vcpu)) { + u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2); + hcrx |= val & __HCRX_EL2_MASK; + hcrx &= ~(~val & __HCRX_EL2_nMASK); + } + + ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2); + write_sysreg_s(hcrx, SYS_HCRX_EL2); + } + + __activate_traps_hfgxtr(vcpu); + __activate_traps_mpam(vcpu); +} + +static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + write_sysreg(0, hstr_el2); + if (system_supports_pmuv3()) { + write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); + vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); + } + + if (cpus_have_final_cap(ARM64_HAS_HCX)) + write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2); + + __deactivate_traps_hfgxtr(vcpu); + __deactivate_traps_mpam(); +} + +static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr) +{ + if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM)) + hcr |= HCR_TVM; + + write_sysreg_hcr(hcr); + + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) { + u64 vsesr; + + /* + * When HCR_EL2.AMO is set, physical SErrors are taken to EL2 + * and vSError injection is enabled for EL1. Conveniently, for + * NV this means that it is never the case where a 'physical' + * SError (injected by KVM or userspace) and vSError are + * deliverable to the same context. + * + * As such, we can trivially select between the host or guest's + * VSESR_EL2. Except for the case that FEAT_RAS hasn't been + * exposed to the guest, where ESR propagation in hardware + * occurs unconditionally. + * + * Paper over the architectural wart and use an IMPLEMENTATION + * DEFINED ESR value in case FEAT_RAS is hidden from the guest. + */ + if (!vserror_state_is_nested(vcpu)) + vsesr = vcpu->arch.vsesr_el2; + else if (kvm_has_ras(kern_hyp_va(vcpu->kvm))) + vsesr = __vcpu_sys_reg(vcpu, VSESR_EL2); + else + vsesr = ESR_ELx_ISV; + + write_sysreg_s(vsesr, SYS_VSESR_EL2); + } +} + +static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) +{ + u64 *hcr; + + if (vserror_state_is_nested(vcpu)) + hcr = __ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2); + else + hcr = &vcpu->arch.hcr_el2; + + /* + * If we pended a virtual abort, preserve it until it gets + * cleared. 
See D1.14.3 (Virtual Interrupts) for details, but + * the crucial bit is "On taking a vSError interrupt, + * HCR_EL2.VSE is cleared to 0." + * + * Additionally, when in a nested context we need to propagate the + * updated state to the guest hypervisor's HCR_EL2. + */ + if (*hcr & HCR_VSE) { + *hcr &= ~HCR_VSE; + *hcr |= read_sysreg(hcr_el2) & HCR_VSE; + } +} + +static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) +{ + return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); +} + +static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + + /* + * Finish potential single step before executing the prologue + * instruction. + */ + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); + + return true; +} + +static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) +{ + /* + * The vCPU's saved SVE state layout always matches the max VL of the + * vCPU. Start off with the max VL so we can load the SVE state. + */ + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + __sve_restore_state(vcpu_sve_pffr(vcpu), + &vcpu->arch.ctxt.fp_regs.fpsr, + true); + + /* + * The effective VL for a VM could differ from the max VL when running a + * nested guest, as the guest hypervisor could select a smaller VL. Slap + * that into hardware before wrapping up. + */ + if (is_nested_ctxt(vcpu)) + sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2); + + write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR); +} + +static inline void __hyp_sve_save_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR); + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); + __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); +} + +static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + if (vcpu_has_sve(vcpu)) { + /* A guest hypervisor may restrict the effective max VL. */ + if (is_nested_ctxt(vcpu)) + zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); + else + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); + write_sysreg_el1(zcr_el1, SYS_ZCR); + } +} + +static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + /* + * When the guest owns the FP regs, we know that guest+hyp traps for + * any FPSIMD/SVE/SME features exposed to the guest have been disabled + * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() + * prior to __guest_entry(). As __guest_entry() guarantees a context + * synchronization event, we don't need an ISB here to avoid taking + * traps for anything that was exposed to the guest. + */ + if (vcpu_has_sve(vcpu)) { + zcr_el1 = read_sysreg_el1(SYS_ZCR); + __vcpu_assign_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu), zcr_el1); + + /* + * The guest's state is always saved using the guest's max VL. + * Ensure that the host has the guest's max VL active such that + * the host can save the guest's state lazily, but don't + * artificially restrict the host to the guest's max VL. 
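+ * On VHE this only requires the guest's max VL in ZCR_EL2; on nVHE the
+ * host's max VL goes into ZCR_EL2 while the guest's max VL is set in
+ * ZCR_EL1.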
+ */ + if (has_vhe()) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + } else { + zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el1(zcr_el1, SYS_ZCR); + } + } +} + +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (system_supports_sve()) { + __hyp_sve_save_host(); + } else { + __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); + } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); +} + + +/* + * We trap the first access to the FP/SIMD to save the host context and + * restore the guest context lazily. + * If FP/SIMD is not implemented, handle the trap and inject an undefined + * instruction exception to the guest. Similarly for trapped SVE accesses. + */ +static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + bool sve_guest; + u8 esr_ec; + + if (!system_supports_fpsimd()) + return false; + + sve_guest = vcpu_has_sve(vcpu); + esr_ec = kvm_vcpu_trap_get_class(vcpu); + + /* Only handle traps the vCPU can support here: */ + switch (esr_ec) { + case ESR_ELx_EC_FP_ASIMD: + /* Forward traps to the guest hypervisor as required */ + if (guest_hyp_fpsimd_traps_enabled(vcpu)) + return false; + break; + case ESR_ELx_EC_SYS64: + if (WARN_ON_ONCE(!is_hyp_ctxt(vcpu))) + return false; + fallthrough; + case ESR_ELx_EC_SVE: + if (!sve_guest) + return false; + if (guest_hyp_sve_traps_enabled(vcpu)) + return false; + break; + default: + return false; + } + + /* Valid trap. Switch the context: */ + + /* First disable enough traps to allow us to update the registers */ + __deactivate_cptr_traps(vcpu); + isb(); + + /* Write out the host state if it's in the registers */ + if (is_protected_kvm_enabled() && host_owns_fp_regs()) + kvm_hyp_save_fpsimd_host(vcpu); + + /* Restore the guest state */ + if (sve_guest) + __hyp_sve_restore_guest(vcpu); + else + __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs); + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR); + + /* Skip restoring fpexc32 for AArch64 guests */ + if (!(read_sysreg(hcr_el2) & HCR_RW)) + write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2); + + *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED; + + /* + * Re-enable traps necessary for the current state of the guest, e.g. + * those enabled by a guest hypervisor. The ERET to the guest will + * provide the necessary context synchronization. + */ + __activate_cptr_traps(vcpu); + + return true; +} + +static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + int rt = kvm_vcpu_sys_get_rt(vcpu); + u64 val = vcpu_get_reg(vcpu, rt); + + /* + * The normal sysreg handling code expects to see the traps, + * let's not do anything here. 
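+ * That is the case when TVM is already set in the vcpu's own HCR_EL2 value
+ * (i.e. the trap wasn't solely caused by this workaround), as checked just
+ * below.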
+ */ + if (vcpu->arch.hcr_el2 & HCR_TVM) + return false; + + switch (sysreg) { + case SYS_SCTLR_EL1: + write_sysreg_el1(val, SYS_SCTLR); + break; + case SYS_TTBR0_EL1: + write_sysreg_el1(val, SYS_TTBR0); + break; + case SYS_TTBR1_EL1: + write_sysreg_el1(val, SYS_TTBR1); + break; + case SYS_TCR_EL1: + write_sysreg_el1(val, SYS_TCR); + break; + case SYS_ESR_EL1: + write_sysreg_el1(val, SYS_ESR); + break; + case SYS_FAR_EL1: + write_sysreg_el1(val, SYS_FAR); + break; + case SYS_AFSR0_EL1: + write_sysreg_el1(val, SYS_AFSR0); + break; + case SYS_AFSR1_EL1: + write_sysreg_el1(val, SYS_AFSR1); + break; + case SYS_MAIR_EL1: + write_sysreg_el1(val, SYS_MAIR); + break; + case SYS_AMAIR_EL1: + write_sysreg_el1(val, SYS_AMAIR); + break; + case SYS_CONTEXTIDR_EL1: + write_sysreg_el1(val, SYS_CONTEXTIDR); + break; + default: + return false; + } + + __kvm_skip_instr(vcpu); + return true; +} + +/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */ +static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt) +{ + u64 offset = 0; + + if (ctxt->offset.vm_offset) + offset += *kern_hyp_va(ctxt->offset.vm_offset); + if (ctxt->offset.vcpu_offset) + offset += *kern_hyp_va(ctxt->offset.vcpu_offset); + + return offset; +} + +static inline u64 compute_counter_value(struct arch_timer_context *ctxt) +{ + return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt); +} + +static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *ctxt; + u32 sysreg; + u64 val; + + /* + * We only get here for 64bit guests, 32bit guests will hit + * the long and winding road all the way to the standard + * handling. Yes, it sucks to be irrelevant. + * + * Also, we only deal with non-hypervisor context here (either + * an EL1 guest, or a non-HYP context of an EL2 guest). + */ + if (is_hyp_ctxt(vcpu)) + return false; + + sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + + switch (sysreg) { + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + if (!vcpu_el2_e2h_is_set(vcpu)) + val = (val & CNTHCTL_EL1PCTEN) << 10; + + if (!(val & (CNTHCTL_EL1PCTEN << 10))) + return false; + } + + ctxt = vcpu_ptimer(vcpu); + break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + if (val & CNTHCTL_EL1TVCT) + return false; + } + + ctxt = vcpu_vtimer(vcpu); + break; + default: + return false; + } + + val = compute_counter_value(ctxt); + + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + return true; +} + +static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + int rt = kvm_vcpu_sys_get_rt(vcpu); + u64 val = vcpu_get_reg(vcpu, rt); + + if (sysreg != SYS_TCR_EL1) + return false; + + /* + * Affected parts do not advertise support for hardware Access Flag / + * Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying + * control bits are still functional. The architecture requires these be + * RES0 on systems that do not implement FEAT_HAFDBS. + * + * Uphold the requirements of the architecture by masking guest writes + * to TCR_EL1.{HA,HD} here. 
+ */ + val &= ~(TCR_HD | TCR_HA); + write_sysreg_el1(val, SYS_TCR); + __kvm_skip_instr(vcpu); + return true; +} + +static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && + handle_tx2_tvm(vcpu)) + return true; + + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) && + handle_ampere1_tcr(vcpu)) + return true; + + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + if (kvm_handle_cntxct(vcpu)) + return true; + + return false; +} + +static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + return false; +} + +static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, + u64 *exit_code) +{ + if (!__populate_fault_info(vcpu)) + return true; + + return false; +} +#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault +#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault + +static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (kvm_hyp_handle_memory_fault(vcpu, exit_code)) + return true; + + if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { + bool valid; + + valid = kvm_vcpu_trap_is_translation_fault(vcpu) && + kvm_vcpu_dabt_isvalid(vcpu) && + !kvm_vcpu_abt_issea(vcpu) && + !kvm_vcpu_abt_iss1tw(vcpu); + + if (valid) { + int ret = __vgic_v2_perform_cpuif_access(vcpu); + + if (ret == 1) + return true; + + /* Promote an illegal access to an SError.*/ + if (ret == -1) + *exit_code = ARM_EXCEPTION_EL1_SERROR; + } + } + + return false; +} + +typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); + +/* + * Allow the hypervisor to handle the exit with an exit handler if it has one. + * + * Returns true if the hypervisor handled the exit, and control should go back + * to the guest, or false if it hasn't. + */ +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) +{ + exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; + if (fn) + return fn(vcpu, exit_code); + + return false; +} + +static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* + * Check for the conditions of Cortex-A510's #2077057. When these occur + * SPSR_EL2 can't be trusted, but isn't needed either as it is + * unchanged from the value in vcpu_gp_regs(vcpu)->pstate. + * Are we single-stepping the guest, and took a PAC exception from the + * active-not-pending state? + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_2077057) && + vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + *vcpu_cpsr(vcpu) & DBG_SPSR_SS && + ESR_ELx_EC(read_sysreg_el2(SYS_ESR)) == ESR_ELx_EC_PAC) + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); + + vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); +} + +/* + * Return true when we were able to fixup the guest exit and should return to + * the guest, false when we should restore the host state and return to the + * main run loop. 
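+ * The handlers table is indexed by the exception class (ESR_EL2.EC) of the
+ * trapping instruction, as done in kvm_hyp_handle_exit() above.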
+ */ +static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) +{ + if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) + vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); + + if (ARM_SERROR_PENDING(*exit_code) && + ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) { + u8 esr_ec = kvm_vcpu_trap_get_class(vcpu); + + /* + * HVC already have an adjusted PC, which we need to + * correct in order to return to after having injected + * the SError. + * + * SMC, on the other hand, is *trapped*, meaning its + * preferred return address is the SMC itself. + */ + if (esr_ec == ESR_ELx_EC_HVC32 || esr_ec == ESR_ELx_EC_HVC64) + write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR); + } + + /* + * We're using the raw exception code in order to only process + * the trap if no SError is pending. We will come back to the + * same PC once the SError has been injected, and replay the + * trapping instruction. + */ + if (*exit_code != ARM_EXCEPTION_TRAP) + goto exit; + + /* Check if there's an exit handler and allow it to handle the exit. */ + if (kvm_hyp_handle_exit(vcpu, exit_code, handlers)) + goto guest; +exit: + /* Return to the host kernel and handle the exit */ + return false; + +guest: + /* Re-enter the guest */ + asm(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412)); + return true; +} + +static inline void __kvm_unexpected_el2_exception(void) +{ + extern char __guest_exit_restore_elr_and_panic[]; + unsigned long addr, fixup; + struct kvm_exception_table_entry *entry, *end; + unsigned long elr_el2 = read_sysreg(elr_el2); + + entry = &__start___kvm_ex_table; + end = &__stop___kvm_ex_table; + + while (entry < end) { + addr = (unsigned long)&entry->insn + entry->insn; + fixup = (unsigned long)&entry->fixup + entry->fixup; + + if (addr != elr_el2) { + entry++; + continue; + } + + write_sysreg(fixup, elr_el2); + return; + } + + /* Trigger a panic after restoring the hyp context. 
*/ + this_cpu_ptr(&kvm_hyp_ctxt)->sys_regs[ELR_EL2] = elr_el2; + write_sysreg(__guest_exit_restore_elr_and_panic, elr_el2); +} + +#endif /* __ARM64_KVM_HYP_SWITCH_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h new file mode 100644 index 000000000000..a17cbe7582de --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#ifndef __ARM64_KVM_HYP_SYSREG_SR_H__ +#define __ARM64_KVM_HYP_SYSREG_SR_H__ + +#include <linux/compiler.h> +#include <linux/kvm_host.h> + +#include <asm/kprobes.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); + +static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; + + if (!vcpu) + vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); + + return vcpu; +} + +static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt) +{ + return host_data_ptr(host_ctxt) != ctxt; +} + +static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); + + if (ctxt_is_guest(ctxt) && kvm_host_owns_debug_regs(vcpu)) + return &vcpu->arch.external_mdscr_el1; + + return &ctxt_sys_reg(ctxt, MDSCR_EL1); +} + +static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm); + + if (!(ctxt_is_guest(ctxt) && + test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))) + return read_cpuid_id(); + + return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1); +} + +static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) +{ + *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. 
+ if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0); +} + +static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) +{ + ctxt_sys_reg(ctxt, TPIDR_EL0) = read_sysreg(tpidr_el0); + ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); +} + +static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); + + return kvm_has_mte(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_S1PIE)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_s1pie(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_TCR2)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_tcr2(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!system_supports_poe()) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_s1poe(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_ras(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_ras(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_sctlr2(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_SCTLR2)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_sctlr2(kern_hyp_va(vcpu->kvm)); +} + +static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) +{ + ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); + ctxt_sys_reg(ctxt, CPACR_EL1) = read_sysreg_el1(SYS_CPACR); + ctxt_sys_reg(ctxt, TTBR0_EL1) = read_sysreg_el1(SYS_TTBR0); + ctxt_sys_reg(ctxt, TTBR1_EL1) = read_sysreg_el1(SYS_TTBR1); + ctxt_sys_reg(ctxt, TCR_EL1) = read_sysreg_el1(SYS_TCR); + if (ctxt_has_tcrx(ctxt)) { + ctxt_sys_reg(ctxt, TCR2_EL1) = read_sysreg_el1(SYS_TCR2); + + if (ctxt_has_s1pie(ctxt)) { + ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); + ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); + } + + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR); + } + ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR); + ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0); + ctxt_sys_reg(ctxt, AFSR1_EL1) = read_sysreg_el1(SYS_AFSR1); + ctxt_sys_reg(ctxt, FAR_EL1) = read_sysreg_el1(SYS_FAR); + ctxt_sys_reg(ctxt, MAIR_EL1) = read_sysreg_el1(SYS_MAIR); + ctxt_sys_reg(ctxt, VBAR_EL1) = read_sysreg_el1(SYS_VBAR); + ctxt_sys_reg(ctxt, CONTEXTIDR_EL1) = read_sysreg_el1(SYS_CONTEXTIDR); + ctxt_sys_reg(ctxt, AMAIR_EL1) = read_sysreg_el1(SYS_AMAIR); + ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL); + ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par(); + ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1); + + if (ctxt_has_mte(ctxt)) { + ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR); + ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1); + } + + ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1); + ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR); + ctxt_sys_reg(ctxt, SPSR_EL1) = read_sysreg_el1(SYS_SPSR); + + if (ctxt_has_sctlr2(ctxt)) + ctxt_sys_reg(ctxt, SCTLR2_EL1) = read_sysreg_el1(SYS_SCTLR2); +} + +static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) +{ + 
ctxt->regs.pc = read_sysreg_el2(SYS_ELR); + /* + * Guest PSTATE gets saved at guest fixup time in all + * cases. We still need to handle the nVHE host side here. + */ + if (!has_vhe() && ctxt->__hyp_running_vcpu) + ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR); + + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return; + + if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt))) + ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2); + else if (ctxt_has_ras(ctxt)) + ctxt_sys_reg(ctxt, VDISR_EL2) = read_sysreg_s(SYS_VDISR_EL2); +} + +static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) +{ + write_sysreg(*ctxt_mdscr_el1(ctxt), mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. + if (ctxt_has_s1poe(ctxt)) + write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0); +} + +static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) +{ + write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL0), tpidr_el0); + write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0), tpidrro_el0); +} + +static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, + u64 midr, u64 mpidr) +{ + write_sysreg(midr, vpidr_el2); + write_sysreg(mpidr, vmpidr_el2); + + if (has_vhe() || + !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { + write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL1), SYS_SCTLR); + write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL1), SYS_TCR); + } else if (!ctxt->__hyp_running_vcpu) { + /* + * Must only be done for guest registers, hence the context + * test. We're coming from the host, so SCTLR.M is already + * set. Pairs with nVHE's __activate_traps(). + */ + write_sysreg_el1((ctxt_sys_reg(ctxt, TCR_EL1) | + TCR_EPD1_MASK | TCR_EPD0_MASK), + SYS_TCR); + isb(); + } + + write_sysreg_el1(ctxt_sys_reg(ctxt, CPACR_EL1), SYS_CPACR); + write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL1), SYS_TTBR0); + write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL1), SYS_TTBR1); + if (ctxt_has_tcrx(ctxt)) { + write_sysreg_el1(ctxt_sys_reg(ctxt, TCR2_EL1), SYS_TCR2); + + if (ctxt_has_s1pie(ctxt)) { + write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); + write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); + } + + if (ctxt_has_s1poe(ctxt)) + write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR); + } + write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR); + write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0); + write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR1_EL1), SYS_AFSR1); + write_sysreg_el1(ctxt_sys_reg(ctxt, FAR_EL1), SYS_FAR); + write_sysreg_el1(ctxt_sys_reg(ctxt, MAIR_EL1), SYS_MAIR); + write_sysreg_el1(ctxt_sys_reg(ctxt, VBAR_EL1), SYS_VBAR); + write_sysreg_el1(ctxt_sys_reg(ctxt, CONTEXTIDR_EL1), SYS_CONTEXTIDR); + write_sysreg_el1(ctxt_sys_reg(ctxt, AMAIR_EL1), SYS_AMAIR); + write_sysreg_el1(ctxt_sys_reg(ctxt, CNTKCTL_EL1), SYS_CNTKCTL); + write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1), par_el1); + write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1), tpidr_el1); + + if (ctxt_has_mte(ctxt)) { + write_sysreg_el1(ctxt_sys_reg(ctxt, TFSR_EL1), SYS_TFSR); + write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1); + } + + if (!has_vhe() && + cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) && + ctxt->__hyp_running_vcpu) { + /* + * Must only be done for host registers, hence the context + * test. Pairs with nVHE's __deactivate_traps(). + */ + isb(); + /* + * At this stage, and thanks to the above isb(), S2 is + * deconfigured and disabled. We can now restore the host's + * S1 configuration: SCTLR, and only then TCR. 
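+ * An extra isb() separates the two writes so that the SCTLR update takes
+ * effect before TCR is modified.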
+ */ + write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL1), SYS_SCTLR); + isb(); + write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL1), SYS_TCR); + } + + write_sysreg(ctxt_sys_reg(ctxt, SP_EL1), sp_el1); + write_sysreg_el1(ctxt_sys_reg(ctxt, ELR_EL1), SYS_ELR); + write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1), SYS_SPSR); + + if (ctxt_has_sctlr2(ctxt)) + write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR2_EL1), SYS_SCTLR2); +} + +/* Read the VCPU state's PSTATE, but translate (v)EL2 to EL1. */ +static inline u64 to_hw_pstate(const struct kvm_cpu_context *ctxt) +{ + u64 mode = ctxt->regs.pstate & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL2t: + mode = PSR_MODE_EL1t; + break; + case PSR_MODE_EL2h: + mode = PSR_MODE_EL1h; + break; + } + + return (ctxt->regs.pstate & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; +} + +static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) +{ + u64 pstate = to_hw_pstate(ctxt); + u64 mode = pstate & PSR_AA32_MODE_MASK; + u64 vdisr; + + /* + * Safety check to ensure we're setting the CPU up to enter the guest + * in a less privileged mode. + * + * If we are attempting a return to EL2 or higher in AArch64 state, + * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that + * we'll take an illegal exception state exception immediately after + * the ERET to the guest. Attempts to return to AArch32 Hyp will + * result in an illegal exception return because EL2's execution state + * is determined by SCR_EL3.RW. + */ + if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t) + pstate = PSR_MODE_EL2h | PSR_IL_BIT; + + write_sysreg_el2(ctxt->regs.pc, SYS_ELR); + write_sysreg_el2(pstate, SYS_SPSR); + + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return; + + if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt))) + vdisr = ctxt_sys_reg(ctxt, DISR_EL1); + else if (ctxt_has_ras(ctxt)) + vdisr = ctxt_sys_reg(ctxt, VDISR_EL2); + else + vdisr = 0; + + write_sysreg_s(vdisr, SYS_VDISR_EL2); +} + +static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) +{ + if (!vcpu_el1_is_32bit(vcpu)) + return; + + vcpu->arch.ctxt.spsr_abt = read_sysreg(spsr_abt); + vcpu->arch.ctxt.spsr_und = read_sysreg(spsr_und); + vcpu->arch.ctxt.spsr_irq = read_sysreg(spsr_irq); + vcpu->arch.ctxt.spsr_fiq = read_sysreg(spsr_fiq); + + __vcpu_assign_sys_reg(vcpu, DACR32_EL2, read_sysreg(dacr32_el2)); + __vcpu_assign_sys_reg(vcpu, IFSR32_EL2, read_sysreg(ifsr32_el2)); + + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) + __vcpu_assign_sys_reg(vcpu, DBGVCR32_EL2, read_sysreg(dbgvcr32_el2)); +} + +static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) +{ + if (!vcpu_el1_is_32bit(vcpu)) + return; + + write_sysreg(vcpu->arch.ctxt.spsr_abt, spsr_abt); + write_sysreg(vcpu->arch.ctxt.spsr_und, spsr_und); + write_sysreg(vcpu->arch.ctxt.spsr_irq, spsr_irq); + write_sysreg(vcpu->arch.ctxt.spsr_fiq, spsr_fiq); + + write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); + write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); + + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) + write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); +} + +#endif /* __ARM64_KVM_HYP_SYSREG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h b/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h new file mode 100644 index 000000000000..dc61aaa56f31 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_HYP_EARLY_ALLOC_H +#define __KVM_HYP_EARLY_ALLOC_H + +#include 
<asm/kvm_pgtable.h> + +void hyp_early_alloc_init(void *virt, unsigned long size); +unsigned long hyp_early_alloc_nr_used_pages(void); +void *hyp_early_alloc_page(void *arg); +void *hyp_early_alloc_contig(unsigned int nr_pages); + +extern struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops; + +#endif /* __KVM_HYP_EARLY_ALLOC_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h new file mode 100644 index 000000000000..146e0aebfa1c --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran <qwandor@google.com> + */ +#ifndef __KVM_HYP_FFA_H +#define __KVM_HYP_FFA_H + +#include <asm/kvm_host.h> + +#define FFA_MIN_FUNC_NUM 0x60 +#define FFA_MAX_FUNC_NUM 0xFF + +int hyp_ffa_init(void *pages); +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id); + +#endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h new file mode 100644 index 000000000000..3766333bace9 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_HYP_GFP_H +#define __KVM_HYP_GFP_H + +#include <linux/list.h> + +#include <nvhe/memory.h> +#include <nvhe/spinlock.h> + +#define HYP_NO_ORDER ((u8)(~0)) + +struct hyp_pool { + /* + * Spinlock protecting concurrent changes to the memory pool as well as + * the struct hyp_page of the pool's pages until we have a proper atomic + * API at EL2. + */ + hyp_spinlock_t lock; + struct list_head free_area[NR_PAGE_ORDERS]; + phys_addr_t range_start; + phys_addr_t range_end; + u8 max_order; +}; + +/* Allocation */ +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order); +void hyp_split_page(struct hyp_page *page); +void hyp_get_page(struct hyp_pool *pool, void *addr); +void hyp_put_page(struct hyp_pool *pool, void *addr); + +/* Used pages cannot be freed */ +int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, + unsigned int reserved_pages); +#endif /* __KVM_HYP_GFP_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h new file mode 100644 index 000000000000..5f9d56754e39 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#ifndef __KVM_NVHE_MEM_PROTECT__ +#define __KVM_NVHE_MEM_PROTECT__ +#include <linux/kvm_host.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/virt.h> +#include <nvhe/memory.h> +#include <nvhe/pkvm.h> +#include <nvhe/spinlock.h> + +struct host_mmu { + struct kvm_arch arch; + struct kvm_pgtable pgt; + struct kvm_pgtable_mm_ops mm_ops; + hyp_spinlock_t lock; +}; +extern struct host_mmu host_mmu; + +/* This corresponds to page-table locking order */ +enum pkvm_component_id { + PKVM_ID_HOST, + PKVM_ID_HYP, + PKVM_ID_FFA, +}; + +extern unsigned long hyp_nr_cpus; + +int __pkvm_prot_finalize(void); +int __pkvm_host_share_hyp(u64 pfn); +int __pkvm_host_unshare_hyp(u64 pfn); +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, + enum 
kvm_pgtable_prot prot); +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm); +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu); + +bool addr_is_memory(phys_addr_t phys); +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); +int kvm_host_prepare_stage2(void *pgt_pool_base); +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); +void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); + +int hyp_pin_shared_mem(void *from, void *to); +void hyp_unpin_shared_mem(void *from, void *to); +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc); + +static __always_inline void __load_host_stage2(void) +{ + if (static_branch_likely(&kvm_protected_mode_initialized)) + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); + else + write_sysreg(0, vttbr_el2); +} + +#ifdef CONFIG_NVHE_EL2_DEBUG +void pkvm_ownership_selftest(void *base); +#else +static inline void pkvm_ownership_selftest(void *base) { } +#endif +#endif /* __KVM_NVHE_MEM_PROTECT__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h new file mode 100644 index 000000000000..dee1a406b0c2 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_HYP_MEMORY_H +#define __KVM_HYP_MEMORY_H + +#include <asm/kvm_mmu.h> +#include <asm/page.h> + +#include <linux/types.h> + +/* + * Bits 0-1 are used to encode the memory ownership state of each page from the + * point of view of a pKVM "component" (host, hyp, guest, ... see enum + * pkvm_component_id): + * 00: The page is owned and exclusively accessible by the component; + * 01: The page is owned and accessible by the component, but is also + * accessible by another component; + * 10: The page is accessible but not owned by the component; + * The storage of this state depends on the component: either in the + * hyp_vmemmap for the host and hyp states or in PTE software bits for guests. + */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = BIT(0), + PKVM_PAGE_SHARED_BORROWED = BIT(1), + + /* + * 'Meta-states' are not stored directly in PTE SW bits for guest + * states, but inferred from the context (e.g. invalid PTE entries). + * For the host and hyp, meta-states are stored directly in the + * struct hyp_page. + */ + PKVM_NOPAGE = BIT(0) | BIT(1), +}; +#define PKVM_PAGE_STATE_MASK (BIT(0) | BIT(1)) + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + prot &= ~PKVM_PAGE_STATE_PROT_MASK; + prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state); + return prot; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot); +} + +struct hyp_page { + u16 refcount; + u8 order; + + /* Host state. Guarded by the host stage-2 lock. 
*/ + unsigned __host_state : 4; + + /* + * Complement of the hyp state. Guarded by the hyp stage-1 lock. We use + * the complement so that the initial 0 in __hyp_state_comp (due to the + * entire vmemmap starting off zeroed) encodes PKVM_NOPAGE. + */ + unsigned __hyp_state_comp : 4; + + u32 host_share_guest_count; +}; + +extern u64 __hyp_vmemmap; +#define hyp_vmemmap ((struct hyp_page *)__hyp_vmemmap) + +#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset)) + +static inline void *hyp_phys_to_virt(phys_addr_t phys) +{ + return __hyp_va(phys); +} + +static inline phys_addr_t hyp_virt_to_phys(void *addr) +{ + return __hyp_pa(addr); +} + +#define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) +#define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) + +static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys) +{ + BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u64)); + return &hyp_vmemmap[hyp_phys_to_pfn(phys)]; +} + +#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) +#define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt)) + +#define hyp_page_to_pfn(page) ((struct hyp_page *)(page) - hyp_vmemmap) +#define hyp_page_to_phys(page) hyp_pfn_to_phys((hyp_page_to_pfn(page))) +#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) +#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) + +static inline enum pkvm_page_state get_host_state(struct hyp_page *p) +{ + return p->__host_state; +} + +static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__host_state = state; +} + +static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p) +{ + return p->__hyp_state_comp ^ PKVM_PAGE_STATE_MASK; +} + +static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_MASK; +} + +/* + * Refcounting for 'struct hyp_page'. + * hyp_pool::lock must be held if atomic access to the refcount is required. 
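
To make the page-state encoding above concrete, here is a small stand-alone sketch, not kernel code: the names and the software-bit position are assumptions chosen for illustration. It shows a two-bit ownership state being packed into spare PTE bits, in the spirit of pkvm_mkstate()/pkvm_getstate(), and why storing the complement of the hyp state lets a freshly zeroed vmemmap entry decode as PKVM_NOPAGE.

#include <stdint.h>
#include <stdio.h>

#define PAGE_STATE_SHIFT 55                    /* assumed SW-bit position */
#define PAGE_STATE_MASK  (3ULL << PAGE_STATE_SHIFT)

enum page_state {                              /* mirrors pkvm_page_state */
	PAGE_OWNED           = 0,
	PAGE_SHARED_OWNED    = 1,
	PAGE_SHARED_BORROWED = 2,
	PAGE_NOPAGE          = 3,
};

static uint64_t mkstate(uint64_t prot, enum page_state st)
{
	prot &= ~PAGE_STATE_MASK;
	return prot | ((uint64_t)st << PAGE_STATE_SHIFT);
}

static enum page_state getstate(uint64_t prot)
{
	return (prot & PAGE_STATE_MASK) >> PAGE_STATE_SHIFT;
}

int main(void)
{
	uint64_t prot = mkstate(0, PAGE_SHARED_OWNED);
	uint8_t hyp_state_comp = 0;            /* freshly zeroed vmemmap entry */

	printf("PTE state: %d\n", getstate(prot));                /* 1 */
	printf("hyp state: %d\n", hyp_state_comp ^ PAGE_NOPAGE);  /* 3 == NOPAGE */
	return 0;
}
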
+ */ +static inline int hyp_page_count(void *addr) +{ + struct hyp_page *p = hyp_virt_to_page(addr); + + return p->refcount; +} + +static inline void hyp_page_ref_inc(struct hyp_page *p) +{ + BUG_ON(p->refcount == USHRT_MAX); + p->refcount++; +} + +static inline void hyp_page_ref_dec(struct hyp_page *p) +{ + BUG_ON(!p->refcount); + p->refcount--; +} + +static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +{ + hyp_page_ref_dec(p); + return (p->refcount == 0); +} + +static inline void hyp_set_page_refcounted(struct hyp_page *p) +{ + BUG_ON(p->refcount); + p->refcount = 1; +} +#endif /* __KVM_HYP_MEMORY_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h new file mode 100644 index 000000000000..6e83ce35c2f2 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_HYP_MM_H +#define __KVM_HYP_MM_H + +#include <asm/kvm_pgtable.h> +#include <asm/spectre.h> +#include <linux/memblock.h> +#include <linux/types.h> + +#include <nvhe/memory.h> +#include <nvhe/spinlock.h> + +extern struct kvm_pgtable pkvm_pgtable; +extern hyp_spinlock_t pkvm_pgd_lock; + +int hyp_create_fixmap(void); +void *hyp_fixmap_map(phys_addr_t phys); +void hyp_fixmap_unmap(void); +void *hyp_fixblock_map(phys_addr_t phys, size_t *size); +void hyp_fixblock_unmap(void); + +int hyp_create_idmap(u32 hyp_va_bits); +int hyp_map_vectors(void); +int hyp_back_vmemmap(phys_addr_t back); +int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); +int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); +int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot, + unsigned long *haddr); +int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr); +int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr); + +#endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h new file mode 100644 index 000000000000..184ad7a39950 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#ifndef __ARM64_KVM_NVHE_PKVM_H__ +#define __ARM64_KVM_NVHE_PKVM_H__ + +#include <asm/kvm_pkvm.h> + +#include <nvhe/gfp.h> +#include <nvhe/spinlock.h> + +/* + * Holds the relevant data for maintaining the vcpu state completely at hyp. + */ +struct pkvm_hyp_vcpu { + struct kvm_vcpu vcpu; + + /* Backpointer to the host's (untrusted) vCPU instance. */ + struct kvm_vcpu *host_vcpu; + + /* + * If this hyp vCPU is loaded, then this is a backpointer to the + * per-cpu pointer tracking us. Otherwise, NULL if not loaded. + */ + struct pkvm_hyp_vcpu **loaded_hyp_vcpu; +}; + +/* + * Holds the relevant data for running a vm in protected mode. + */ +struct pkvm_hyp_vm { + struct kvm kvm; + + /* Backpointer to the host's (untrusted) KVM instance. */ + struct kvm *host_kvm; + + /* The guest's stage-2 page-table managed by the hypervisor. */ + struct kvm_pgtable pgt; + struct kvm_pgtable_mm_ops mm_ops; + struct hyp_pool pool; + hyp_spinlock_t lock; + + /* Array of the hyp vCPU structures for this VM. 
*/ + struct pkvm_hyp_vcpu *vcpus[]; +}; + +extern hyp_spinlock_t vm_table_lock; + +static inline struct pkvm_hyp_vm * +pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm); +} + +static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return vcpu_is_protected(&hyp_vcpu->vcpu); +} + +static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm) +{ + return kvm_vm_is_protected(&hyp_vm->kvm); +} + +void pkvm_hyp_vm_table_init(void *tbl); + +int __pkvm_reserve_vm(void); +void __pkvm_unreserve_vm(pkvm_handle_t handle); +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva); +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva); +int __pkvm_teardown_vm(pkvm_handle_t handle); + +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx); +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle); +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); + +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); +int kvm_check_pvm_sysreg_table(void); + +#endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h new file mode 100644 index 000000000000..7c7ea8c55405 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * A stand-alone ticket spinlock implementation for use by the non-VHE + * KVM hypervisor code running at EL2. + * + * Copyright (C) 2020 Google LLC + * Author: Will Deacon <will@kernel.org> + * + * Heavily based on the implementation removed by c11090474d70 which was: + * Copyright (C) 2012 ARM Ltd. + */ + +#ifndef __ARM64_KVM_NVHE_SPINLOCK_H__ +#define __ARM64_KVM_NVHE_SPINLOCK_H__ + +#include <asm/alternative.h> +#include <asm/lse.h> +#include <asm/rwonce.h> + +typedef union hyp_spinlock { + u32 __val; + struct { +#ifdef __AARCH64EB__ + u16 next, owner; +#else + u16 owner, next; +#endif + }; +} hyp_spinlock_t; + +#define __HYP_SPIN_LOCK_INITIALIZER \ + { .__val = 0 } + +#define __HYP_SPIN_LOCK_UNLOCKED \ + ((hyp_spinlock_t) __HYP_SPIN_LOCK_INITIALIZER) + +#define DEFINE_HYP_SPINLOCK(x) hyp_spinlock_t x = __HYP_SPIN_LOCK_UNLOCKED + +#define hyp_spin_lock_init(l) \ +do { \ + *(l) = __HYP_SPIN_LOCK_UNLOCKED; \ +} while (0) + +static inline void hyp_spin_lock(hyp_spinlock_t *lock) +{ + u32 tmp; + hyp_spinlock_t lockval, newval; + + asm volatile( + /* Atomically increment the next ticket. */ + ARM64_LSE_ATOMIC_INSN( + /* LL/SC */ +" prfm pstl1strm, %3\n" +"1: ldaxr %w0, %3\n" +" add %w1, %w0, #(1 << 16)\n" +" stxr %w2, %w1, %3\n" +" cbnz %w2, 1b\n", + /* LSE atomics */ +" mov %w2, #(1 << 16)\n" +" ldadda %w2, %w0, %3\n" + __nops(3)) + + /* Did we get the lock? */ +" eor %w1, %w0, %w0, ror #16\n" +" cbz %w1, 3f\n" + /* + * No: spin on the owner. Send a local event to avoid missing an + * unlock before the exclusive load. + */ +" sevl\n" +"2: wfe\n" +" ldaxrh %w2, %4\n" +" eor %w1, %w2, %w0, lsr #16\n" +" cbnz %w1, 2b\n" + /* We got the lock. Critical section starts here. 
*/ +"3:" + : "=&r" (lockval), "=&r" (newval), "=&r" (tmp), "+Q" (*lock) + : "Q" (lock->owner) + : "memory"); +} + +static inline void hyp_spin_unlock(hyp_spinlock_t *lock) +{ + u64 tmp; + + asm volatile( + ARM64_LSE_ATOMIC_INSN( + /* LL/SC */ + " ldrh %w1, %0\n" + " add %w1, %w1, #1\n" + " stlrh %w1, %0", + /* LSE atomics */ + " mov %w1, #1\n" + " staddlh %w1, %0\n" + __nops(1)) + : "=Q" (lock->owner), "=&r" (tmp) + : + : "memory"); +} + +static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock) +{ + hyp_spinlock_t lockval = READ_ONCE(*lock); + + return lockval.owner != lockval.next; +} + +#ifdef CONFIG_NVHE_EL2_DEBUG +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) +{ + /* + * The __pkvm_init() path accesses protected data-structures without + * holding locks as the other CPUs are guaranteed to not enter EL2 + * concurrently at this point in time. The point by which EL2 is + * initialized on all CPUs is reflected in the pkvm static key, so + * wait until it is set before checking the lock state. + */ + if (static_branch_likely(&kvm_protected_mode_initialized)) + BUG_ON(!hyp_spin_is_locked(lock)); +} +#else +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { } +#endif + +#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h new file mode 100644 index 000000000000..ba5382c12787 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Trap handler helpers. + * + * Copyright (C) 2020 - Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#ifndef __ARM64_KVM_NVHE_TRAP_HANDLER_H__ +#define __ARM64_KVM_NVHE_TRAP_HANDLER_H__ + +#include <asm/kvm_host.h> + +#define cpu_reg(ctxt, r) (ctxt)->regs.regs[r] +#define DECLARE_REG(type, name, ctxt, reg) \ + __always_unused int ___check_reg_ ## reg; \ + type name = (type)cpu_reg(ctxt, (reg)) + +#endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore new file mode 100644 index 000000000000..5b6c43cc96f8 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +gen-hyprel +hyp.lds +hyp-reloc.S diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile new file mode 100644 index 000000000000..a244ec25f8c5 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part +# + +asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS + +# Tracepoint and MMIO logging symbols should not be visible at nVHE KVM as +# there is no way to execute them and any such MMIO access from nVHE KVM +# will explode instantly (Words of Marc Zyngier). So introduce a generic flag +# __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM. 
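
To make the effect of this flag concrete, here is a hypothetical header sketch; the file and function names are invented for illustration and this is not the kernel's tracing code. When __DISABLE_TRACE_MMIO__ is defined, as it is by the ccflags that follow, the logging helper collapses to an empty inline stub, so the resulting nVHE objects carry no tracepoint or MMIO-logging references at all.

/* hypothetical_mmio_trace.h, not an actual kernel header */
#ifndef __DISABLE_TRACE_MMIO__
void log_mmio_write(unsigned long addr, unsigned long val);  /* pulls in tracing */
#else
/* nVHE build: nothing is referenced, so nothing can be executed at EL2 */
static inline void log_mmio_write(unsigned long addr, unsigned long val)
{
	(void)addr;
	(void)val;
}
#endif

/* A caller compiled with -D__DISABLE_TRACE_MMIO__, as the ccflags below do
 * for every nVHE object, ends up with the empty stub and no external symbol. */
static inline void write_reg_logged(unsigned long val, volatile unsigned long *reg)
{
	*reg = val;
	log_mmio_write((unsigned long)reg, val);
}
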
+ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ +ccflags-y += -fno-stack-protector \ + -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_KSTACK_ERASE) + +hostprogs := gen-hyprel +HOST_EXTRACFLAGS += -I$(objtree)/include + +lib-objs := clear_page.o copy_page.o memcpy.o memset.o +lib-objs := $(addprefix ../../../lib/, $(lib-objs)) + +CFLAGS_switch.nvhe.o += -Wno-override-init + +hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ + hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o +hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ + ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o +hyp-obj-y += ../../../kernel/smccc-call.o +hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o +hyp-obj-y += $(lib-objs) + +## +## Build rules for compiling nVHE hyp code +## Output of this folder is `kvm_nvhe.o`, a partially linked object +## file containing all nVHE hyp code and data. +## + +hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y)) +obj-y := kvm_nvhe.o +targets += $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o + +# 1) Compile all source files to `.nvhe.o` object files. The file extension +# avoids file name clashes for files shared with VHE. +$(obj)/%.nvhe.o: $(src)/%.c FORCE + $(call if_changed_rule,cc_o_c) +$(obj)/%.nvhe.o: $(src)/%.S FORCE + $(call if_changed_rule,as_o_S) + +# 2) Compile linker script. +$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE + $(call if_changed_dep,cpp_lds_S) + +# 3) Partially link all '.nvhe.o' files and apply the linker script. +# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'. +# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before +# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to +# keep the dependency on the target while avoiding an error from +# GNU ld if the linker script is passed to it twice. +LDFLAGS_kvm_nvhe.tmp.o := -r -T +$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE + $(call if_changed,ld) + +# 4) Generate list of hyp code/data positions that need to be relocated at +# runtime. Because the hypervisor is part of the kernel binary, relocations +# produce a kernel VA. We enumerate relocations targeting hyp at build time +# and convert the kernel VAs at those positions to hyp VAs. +$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel FORCE + $(call if_changed,hyprel) + +# 5) Compile hyp-reloc.S and link it into the existing partially linked object. +# The object file now contains a section with pointers to hyp positions that +# will contain kernel VAs at runtime. These pointers have relocations on them +# so that they get updated as the hyp object is linked into `vmlinux`. +LDFLAGS_kvm_nvhe.rel.o := -r +$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE + $(call if_changed,ld) + +# 6) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. +# Prefixes names of ELF symbols with '__kvm_nvhe_'. +$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE + $(call if_changed,hypcopy) + +# The HYPREL command calls `gen-hyprel` to generate an assembly file with +# a list of relocations targeting hyp code/data. +quiet_cmd_hyprel = HYPREL $@ + cmd_hyprel = $(obj)/gen-hyprel $< > $@ + +# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names +# to avoid clashes with VHE code/data. 
+quiet_cmd_hypcopy = HYPCOPY $@ + cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ + +# Remove ftrace and Shadow Call Stack CFLAGS. +# This is equivalent to the 'notrace' and '__noscs' annotations. +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) +# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile' +# when profile optimization is applied. gen-hyprel does not support SHT_REL and +# causes a build failure. Remove profile optimization flags. +KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables + +ifeq ($(CONFIG_UBSAN_KVM_EL2),y) +UBSAN_SANITIZE := y +# Always use brk and not hooks +ccflags-y += $(CFLAGS_UBSAN_TRAP) +endif diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S new file mode 100644 index 000000000000..85936c17ae40 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/cache.S @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Code copied from arch/arm64/mm/cache.S. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/alternative.h> + +SYM_FUNC_START(__pi_dcache_clean_inval_poc) + dcache_by_line_op civac, sy, x0, x1, x2, x3 + ret +SYM_FUNC_END(__pi_dcache_clean_inval_poc) +SYM_FUNC_ALIAS(dcache_clean_inval_poc, __pi_dcache_clean_inval_poc) + +SYM_FUNC_START(__pi_icache_inval_pou) +alternative_if ARM64_HAS_CACHE_DIC + isb + ret +alternative_else_nop_endif + + invalidate_icache_by_line x0, x1, x2, x3 + ret +SYM_FUNC_END(__pi_icache_inval_pou) +SYM_FUNC_ALIAS(icache_inval_pou, __pi_icache_inval_pou) diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c new file mode 100644 index 000000000000..2a1c0f49792b --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <hyp/debug-sr.h> + +#include <linux/compiler.h> +#include <linux/kvm_host.h> + +#include <asm/debug-monitors.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +static void __debug_save_spe(u64 *pmscr_el1) +{ + u64 reg; + + /* Clear pmscr in case of early return */ + *pmscr_el1 = 0; + + /* + * At this point, we know that this CPU implements + * SPE and is available to the host. + * Check if the host is actually using it ? 
+ */ + reg = read_sysreg_s(SYS_PMBLIMITR_EL1); + if (!(reg & BIT(PMBLIMITR_EL1_E_SHIFT))) + return; + + /* Yes; save the control register and disable data generation */ + *pmscr_el1 = read_sysreg_el1(SYS_PMSCR); + write_sysreg_el1(0, SYS_PMSCR); + isb(); + + /* Now drain all buffered data to memory */ + psb_csync(); +} + +static void __debug_restore_spe(u64 pmscr_el1) +{ + if (!pmscr_el1) + return; + + /* The host page table is installed, but not yet synchronised */ + isb(); + + /* Re-enable data generation */ + write_sysreg_el1(pmscr_el1, SYS_PMSCR); +} + +static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) +{ + *saved_trfcr = read_sysreg_el1(SYS_TRFCR); + write_sysreg_el1(new_trfcr, SYS_TRFCR); +} + +static bool __trace_needs_drain(void) +{ + if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE)) + return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E; + + return host_data_test_flag(TRBE_ENABLED); +} + +static bool __trace_needs_switch(void) +{ + return host_data_test_flag(TRBE_ENABLED) || + host_data_test_flag(EL1_TRACING_CONFIGURED); +} + +static void __trace_switch_to_guest(void) +{ + /* Unsupported with TRBE so disable */ + if (host_data_test_flag(TRBE_ENABLED)) + *host_data_ptr(trfcr_while_in_guest) = 0; + + __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1), + *host_data_ptr(trfcr_while_in_guest)); + + if (__trace_needs_drain()) { + isb(); + tsb_csync(); + } +} + +static void __trace_switch_to_host(void) +{ + __trace_do_switch(host_data_ptr(trfcr_while_in_guest), + *host_data_ptr(host_debug_state.trfcr_el1)); +} + +static void __debug_save_brbe(u64 *brbcr_el1) +{ + *brbcr_el1 = 0; + + /* Check if the BRBE is enabled */ + if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE))) + return; + + /* + * Prohibit branch record generation while we are in guest. + * Since access to BRBCR_EL1 is trapped, the guest can't + * modify the filtering set by the host. 
+ */ + *brbcr_el1 = read_sysreg_el1(SYS_BRBCR); + write_sysreg_el1(0, SYS_BRBCR); +} + +static void __debug_restore_brbe(u64 brbcr_el1) +{ + if (!brbcr_el1) + return; + + /* Restore BRBE controls */ + write_sysreg_el1(brbcr_el1, SYS_BRBCR); +} + +void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) +{ + /* Disable and flush SPE data generation */ + if (host_data_test_flag(HAS_SPE)) + __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1)); + + /* Disable BRBE branch records */ + if (host_data_test_flag(HAS_BRBE)) + __debug_save_brbe(host_data_ptr(host_debug_state.brbcr_el1)); + + if (__trace_needs_switch()) + __trace_switch_to_guest(); +} + +void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +{ + __debug_switch_to_guest_common(vcpu); +} + +void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) +{ + if (host_data_test_flag(HAS_SPE)) + __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1)); + if (host_data_test_flag(HAS_BRBE)) + __debug_restore_brbe(*host_data_ptr(host_debug_state.brbcr_el1)); + if (__trace_needs_switch()) + __trace_switch_to_host(); +} + +void __debug_switch_to_host(struct kvm_vcpu *vcpu) +{ + __debug_switch_to_host_common(vcpu); +} diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c new file mode 100644 index 000000000000..00de04153cc6 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/early_alloc.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <asm/kvm_pgtable.h> + +#include <nvhe/early_alloc.h> +#include <nvhe/memory.h> + +struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops; +s64 __ro_after_init hyp_physvirt_offset; + +static unsigned long base; +static unsigned long end; +static unsigned long cur; + +unsigned long hyp_early_alloc_nr_used_pages(void) +{ + return (cur - base) >> PAGE_SHIFT; +} + +void *hyp_early_alloc_contig(unsigned int nr_pages) +{ + unsigned long size = (nr_pages << PAGE_SHIFT); + void *ret = (void *)cur; + + if (!nr_pages) + return NULL; + + if (end - cur < size) + return NULL; + + cur += size; + memset(ret, 0, size); + + return ret; +} + +void *hyp_early_alloc_page(void *arg) +{ + return hyp_early_alloc_contig(1); +} + +static void hyp_early_alloc_get_page(void *addr) { } +static void hyp_early_alloc_put_page(void *addr) { } + +void hyp_early_alloc_init(void *virt, unsigned long size) +{ + base = cur = (unsigned long)virt; + end = base + size; + + hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page; + hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt; + hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys; + hyp_early_alloc_mm_ops.get_page = hyp_early_alloc_get_page; + hyp_early_alloc_mm_ops.put_page = hyp_early_alloc_put_page; +} diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c new file mode 100644 index 000000000000..f731cc4c3f28 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -0,0 +1,993 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by + * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware + * Framework for Arm A-profile", which is specified by Arm in document + * number DEN0077. + * + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran <qwandor@google.com> + * + * This driver hooks into the SMC trapping logic for the host and intercepts + * all calls falling within the FF-A range. 
Each call is either: + * + * - Forwarded on unmodified to the SPMD at EL3 + * - Rejected as "unsupported" + * - Accompanied by a host stage-2 page-table check/update and reissued + * + * Consequently, any attempts by the host to make guest memory pages + * accessible to the secure world using FF-A will be detected either here + * (in the case that the memory is already owned by the guest) or during + * donation to the guest (in the case that the memory was previously shared + * with the secure world). + * + * To allow the rolling-back of page-table updates and FF-A calls in the + * event of failure, operations involving the RXTX buffers are locked for + * the duration and are therefore serialised. + */ + +#include <linux/arm-smccc.h> +#include <linux/arm_ffa.h> +#include <asm/kvm_pkvm.h> + +#include <nvhe/ffa.h> +#include <nvhe/mem_protect.h> +#include <nvhe/memory.h> +#include <nvhe/trap_handler.h> +#include <nvhe/spinlock.h> + +/* + * "ID value 0 must be returned at the Non-secure physical FF-A instance" + * We share this ID with the host. + */ +#define HOST_FFA_ID 0 + +/* + * A buffer to hold the maximum descriptor size we can see from the host, + * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP + * when resolving the handle on the reclaim path. + */ +struct kvm_ffa_descriptor_buffer { + void *buf; + size_t len; +}; + +static struct kvm_ffa_descriptor_buffer ffa_desc_buf; + +struct kvm_ffa_buffers { + hyp_spinlock_t lock; + void *tx; + void *rx; +}; + +/* + * Note that we don't currently lock these buffers explicitly, instead + * relying on the locking of the host FFA buffers as we only have one + * client. + */ +static struct kvm_ffa_buffers hyp_buffers; +static struct kvm_ffa_buffers host_buffers; +static u32 hyp_ffa_version; +static bool has_version_negotiated; +static hyp_spinlock_t version_lock; + +static void ffa_to_smccc_error(struct arm_smccc_1_2_regs *res, u64 ffa_errno) +{ + *res = (struct arm_smccc_1_2_regs) { + .a0 = FFA_ERROR, + .a2 = ffa_errno, + }; +} + +static void ffa_to_smccc_res_prop(struct arm_smccc_1_2_regs *res, int ret, u64 prop) +{ + if (ret == FFA_RET_SUCCESS) { + *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_SUCCESS, + .a2 = prop }; + } else { + ffa_to_smccc_error(res, ret); + } +} + +static void ffa_to_smccc_res(struct arm_smccc_1_2_regs *res, int ret) +{ + ffa_to_smccc_res_prop(res, ret, 0); +} + +static void ffa_set_retval(struct kvm_cpu_context *ctxt, + struct arm_smccc_1_2_regs *res) +{ + cpu_reg(ctxt, 0) = res->a0; + cpu_reg(ctxt, 1) = res->a1; + cpu_reg(ctxt, 2) = res->a2; + cpu_reg(ctxt, 3) = res->a3; + cpu_reg(ctxt, 4) = res->a4; + cpu_reg(ctxt, 5) = res->a5; + cpu_reg(ctxt, 6) = res->a6; + cpu_reg(ctxt, 7) = res->a7; + + /* + * DEN0028C 2.6: SMC32/HVC32 call from aarch64 must preserve x8-x30. + * + * In FF-A 1.2, we cannot rely on the function ID sent by the caller to + * detect 32-bit calls because the CPU cycle management interfaces (e.g. + * FFA_MSG_WAIT, FFA_RUN) are 32-bit only but can have 64-bit responses. + * + * FFA-1.3 introduces 64-bit variants of the CPU cycle management + * interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests + * complete with SMC32 direct responses which *should* allow us use the + * function ID sent by the caller to determine whether to return x8-x17. + * + * Note that we also cannot rely on function IDs in the response. + * + * Given the above, assume SMC64 and send back x0-x17 unconditionally + * as the passthrough code (__kvm_hyp_host_forward_smc) does the same. 
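
As background for is_ffa_call(), which appears just below, an SMCCC function identifier packs several fields into 32 bits: a fast-call flag, a 32/64-bit convention flag, an owning-entity number and a 16-bit function number. The stand-alone sketch below decodes an example identifier; the constants are written out here rather than taken from the kernel headers, and the example value is only meant to land in the FF-A range.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SMCCC_FAST_CALL(id)  (((id) >> 31) & 0x1)   /* 1 = fast call        */
#define SMCCC_IS_64BIT(id)   (((id) >> 30) & 0x1)   /* SMC64/HVC64 variant  */
#define SMCCC_OWNER(id)      (((id) >> 24) & 0x3f)  /* owning entity        */
#define SMCCC_FUNC_NUM(id)   ((id) & 0xffff)        /* function number      */

#define OWNER_STANDARD       4       /* standard secure services, incl. FF-A */
#define FFA_MIN_FUNC_NUM     0x60
#define FFA_MAX_FUNC_NUM     0xFF

static bool looks_like_ffa(uint32_t id)
{
	return SMCCC_FAST_CALL(id) &&
	       SMCCC_OWNER(id) == OWNER_STANDARD &&
	       SMCCC_FUNC_NUM(id) >= FFA_MIN_FUNC_NUM &&
	       SMCCC_FUNC_NUM(id) <= FFA_MAX_FUNC_NUM;
}

int main(void)
{
	uint32_t id = 0xC4000066;   /* fast, 64-bit convention, owner 4, func 0x66 */

	printf("fast=%u 64bit=%u owner=%u func=0x%x ffa=%d\n",
	       SMCCC_FAST_CALL(id), SMCCC_IS_64BIT(id),
	       SMCCC_OWNER(id), SMCCC_FUNC_NUM(id), looks_like_ffa(id));
	return 0;
}
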
+ */ + cpu_reg(ctxt, 8) = res->a8; + cpu_reg(ctxt, 9) = res->a9; + cpu_reg(ctxt, 10) = res->a10; + cpu_reg(ctxt, 11) = res->a11; + cpu_reg(ctxt, 12) = res->a12; + cpu_reg(ctxt, 13) = res->a13; + cpu_reg(ctxt, 14) = res->a14; + cpu_reg(ctxt, 15) = res->a15; + cpu_reg(ctxt, 16) = res->a16; + cpu_reg(ctxt, 17) = res->a17; +} + +static bool is_ffa_call(u64 func_id) +{ + return ARM_SMCCC_IS_FAST_CALL(func_id) && + ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && + ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM && + ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM; +} + +static int ffa_map_hyp_buffers(u64 ffa_page_count) +{ + struct arm_smccc_1_2_regs res; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_RXTX_MAP, + .a1 = hyp_virt_to_phys(hyp_buffers.tx), + .a2 = hyp_virt_to_phys(hyp_buffers.rx), + .a3 = ffa_page_count, + }, &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static int ffa_unmap_hyp_buffers(void) +{ + struct arm_smccc_1_2_regs res; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RXTX_UNMAP, + .a1 = HOST_FFA_ID, + }, &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo, + u32 handle_hi, u32 fraglen, u32 endpoint_id) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_TX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fraglen, + .a4 = endpoint_id, + }, res); +} + +static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo, + u32 handle_hi, u32 fragoff) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_RX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fragoff, + .a4 = HOST_FFA_ID, + }, res); +} + +static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len, + u32 fraglen) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = func_id, + .a1 = len, + .a2 = fraglen, + }, res); +} + +static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo, + u32 handle_hi, u32 flags) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_RECLAIM, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = flags, + }, res); +} + +static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_MEM_RETRIEVE_REQ, + .a1 = len, + .a2 = len, + }, res); +} + +static void ffa_rx_release(struct arm_smccc_1_2_regs *res) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RX_RELEASE, + }, res); +} + +static void do_ffa_rxtx_map(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(phys_addr_t, tx, ctxt, 1); + DECLARE_REG(phys_addr_t, rx, ctxt, 2); + DECLARE_REG(u32, npages, ctxt, 3); + int ret = 0; + void *rx_virt, *tx_virt; + + if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (host_buffers.tx) { + ret = FFA_RET_DENIED; + goto out_unlock; + } + + /* + * Map our hypervisor buffers into the SPMD before mapping and + * pinning the host buffers in our own address space. 
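
A quick worked example of the page-size conversion used by the npages check above and by the __ffa_host_share_ranges()/__ffa_host_unshare_ranges() helpers further down. FFA_PAGE_SIZE is the 4KiB granule the FF-A ABI counts in, while the hypervisor tracks kernel-sized pages; the mailbox page count and the 64KiB kernel page size below are assumptions picked to make the arithmetic visible.

#include <stdint.h>
#include <stdio.h>

#define FFA_PAGE_SIZE  4096u     /* granule the FF-A ABI counts in       */
#define PAGE_SIZE      65536u    /* example: a 64KiB-page kernel          */
#define MBOX_NR_PAGES  1u        /* assumed mailbox size in kernel pages  */

int main(void)
{
	/* RXTX_MAP passes a count of FF-A pages, so one kernel mailbox
	 * page must be advertised as PAGE_SIZE / FFA_PAGE_SIZE entries. */
	uint32_t npages = MBOX_NR_PAGES * PAGE_SIZE / FFA_PAGE_SIZE;

	/* A constituent range of pg_cnt FF-A pages converts to kernel
	 * pages only if the byte size is kernel-page aligned. */
	uint64_t pg_cnt = 32;                    /* 128KiB */
	uint64_t sz = pg_cnt * FFA_PAGE_SIZE;

	printf("expected npages = %u\n", npages);             /* 16 */
	if (sz % PAGE_SIZE == 0)
		printf("share %llu kernel pages\n",
		       (unsigned long long)(sz / PAGE_SIZE)); /* 2 */
	return 0;
}
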
+ */ + ret = ffa_map_hyp_buffers(npages); + if (ret) + goto out_unlock; + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unmap; + } + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_tx; + } + + tx_virt = hyp_phys_to_virt(tx); + ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_rx; + } + + rx_virt = hyp_phys_to_virt(rx); + ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unpin_tx; + } + + host_buffers.tx = tx_virt; + host_buffers.rx = rx_virt; + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); + return; + +err_unpin_tx: + hyp_unpin_shared_mem(tx_virt, tx_virt + 1); +err_unshare_rx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx)); +err_unshare_tx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx)); +err_unmap: + ffa_unmap_hyp_buffers(); + goto out_unlock; +} + +static void do_ffa_rxtx_unmap(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + int ret = 0; + + if (id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx))); + host_buffers.tx = NULL; + + hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx))); + host_buffers.rx = NULL; + + ffa_unmap_hyp_buffers(); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); +} + +static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nshared = __ffa_host_share_ranges(ranges, nranges); + int ret = 0; + + if (nshared != nranges) { + WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges); + int ret = 0; + + if (nunshared != nranges) { + WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static void do_ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, fraglen, ctxt, 3); + DECLARE_REG(u32, 
endpoint_id, ctxt, 4); + struct ffa_mem_region_addr_range *buf; + int ret = FFA_RET_INVALID_PARAMETERS; + u32 nr_ranges; + + if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) + goto out; + + if (fraglen % sizeof(*buf)) + goto out; + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) + goto out_unlock; + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + nr_ranges = fraglen / sizeof(*buf); + + ret = ffa_host_share_ranges(buf, nr_ranges); + if (ret) { + /* + * We're effectively aborting the transaction, so we need + * to restore the global state back to what it was prior to + * transmission of the first fragment. + */ + ffa_mem_reclaim(res, handle_lo, handle_hi, 0); + WARN_ON(res->a0 != FFA_SUCCESS); + goto out_unlock; + } + + ffa_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id); + if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX) + WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges)); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + + /* + * If for any reason this did not succeed, we're in trouble as we have + * now lost the content of the previous fragments and we can't rollback + * the host stage-2 changes. The pages previously marked as shared will + * remain stuck in that state forever, hence preventing the host from + * sharing/donating them again and may possibly lead to subsequent + * failures, but this will not compromise confidentiality. + */ + return; +} + +static void __do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, len, ctxt, 1); + DECLARE_REG(u32, fraglen, ctxt, 2); + DECLARE_REG(u64, addr_mbz, ctxt, 3); + DECLARE_REG(u32, npages_mbz, ctxt, 4); + struct ffa_mem_region_attributes *ep_mem_access; + struct ffa_composite_mem_region *reg; + struct ffa_mem_region *buf; + u32 offset, nr_ranges, checked_offset; + int ret = 0; + + if (addr_mbz || npages_mbz || fraglen > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (fraglen < sizeof(struct ffa_mem_region) + + sizeof(struct ffa_mem_region_attributes)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + + ep_mem_access = (void *)buf + + ffa_mem_desc_offset(buf, 0, hyp_ffa_version); + offset = ep_mem_access->composite_off; + if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (fraglen < checked_offset) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + reg = (void *)buf + offset; + nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents; + if (nr_ranges % sizeof(reg->constituents[0])) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + nr_ranges /= sizeof(reg->constituents[0]); + ret = ffa_host_share_ranges(reg->constituents, nr_ranges); + if (ret) + goto out_unlock; + + ffa_mem_xfer(res, func_id, len, fraglen); + if (fraglen != len) { + if (res->a0 != FFA_MEM_FRAG_RX) + goto err_unshare; + + if (res->a3 != fraglen) + goto err_unshare; + } else if (res->a0 != 
FFA_SUCCESS) { + goto err_unshare; + } + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + return; + +err_unshare: + WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges)); + goto out_unlock; +} + +#define do_ffa_mem_xfer(fid, res, ctxt) \ + do { \ + BUILD_BUG_ON((fid) != FFA_FN64_MEM_SHARE && \ + (fid) != FFA_FN64_MEM_LEND); \ + __do_ffa_mem_xfer((fid), (res), (ctxt)); \ + } while (0); + +static void do_ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, flags, ctxt, 3); + struct ffa_mem_region_attributes *ep_mem_access; + struct ffa_composite_mem_region *reg; + u32 offset, len, fraglen, fragoff; + struct ffa_mem_region *buf; + int ret = 0; + u64 handle; + + handle = PACK_HANDLE(handle_lo, handle_hi); + + hyp_spin_lock(&host_buffers.lock); + + buf = hyp_buffers.tx; + *buf = (struct ffa_mem_region) { + .sender_id = HOST_FFA_ID, + .handle = handle, + }; + + ffa_retrieve_req(res, sizeof(*buf)); + buf = hyp_buffers.rx; + if (res->a0 != FFA_MEM_RETRIEVE_RESP) + goto out_unlock; + + len = res->a1; + fraglen = res->a2; + + ep_mem_access = (void *)buf + + ffa_mem_desc_offset(buf, 0, hyp_ffa_version); + offset = ep_mem_access->composite_off; + /* + * We can trust the SPMD to get this right, but let's at least + * check that we end up with something that doesn't look _completely_ + * bogus. + */ + if (WARN_ON(offset > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { + ret = FFA_RET_ABORTED; + ffa_rx_release(res); + goto out_unlock; + } + + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + ffa_rx_release(res); + goto out_unlock; + } + + buf = ffa_desc_buf.buf; + memcpy(buf, hyp_buffers.rx, fraglen); + ffa_rx_release(res); + + for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { + ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff); + if (res->a0 != FFA_MEM_FRAG_TX) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + fraglen = res->a3; + memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + ffa_rx_release(res); + } + + ffa_mem_reclaim(res, handle_lo, handle_hi, flags); + if (res->a0 != FFA_SUCCESS) + goto out_unlock; + + reg = (void *)buf + offset; + /* If the SPMD was happy, then we should be too. */ + WARN_ON(ffa_host_unshare_ranges(reg->constituents, + reg->addr_range_cnt)); +out_unlock: + hyp_spin_unlock(&host_buffers.lock); + + if (ret) + ffa_to_smccc_res(res, ret); +} + +/* + * Is a given FFA function supported, either by forwarding on directly + * or by handling at EL2? 
+ */ +static bool ffa_call_supported(u64 func_id) +{ + switch (func_id) { + /* Unsupported memory management calls */ + case FFA_FN64_MEM_RETRIEVE_REQ: + case FFA_MEM_RETRIEVE_RESP: + case FFA_MEM_RELINQUISH: + case FFA_MEM_OP_PAUSE: + case FFA_MEM_OP_RESUME: + case FFA_MEM_FRAG_RX: + case FFA_FN64_MEM_DONATE: + /* Indirect message passing via RX/TX buffers */ + case FFA_MSG_SEND: + case FFA_MSG_POLL: + case FFA_MSG_WAIT: + /* 32-bit variants of 64-bit calls */ + case FFA_MSG_SEND_DIRECT_RESP: + case FFA_RXTX_MAP: + case FFA_MEM_DONATE: + case FFA_MEM_RETRIEVE_REQ: + /* Optional notification interfaces added in FF-A 1.1 */ + case FFA_NOTIFICATION_BITMAP_CREATE: + case FFA_NOTIFICATION_BITMAP_DESTROY: + case FFA_NOTIFICATION_BIND: + case FFA_NOTIFICATION_UNBIND: + case FFA_NOTIFICATION_SET: + case FFA_NOTIFICATION_GET: + case FFA_NOTIFICATION_INFO_GET: + /* Optional interfaces added in FF-A 1.2 */ + case FFA_MSG_SEND_DIRECT_REQ2: /* Optional per 7.5.1 */ + case FFA_MSG_SEND_DIRECT_RESP2: /* Optional per 7.5.1 */ + case FFA_CONSOLE_LOG: /* Optional per 13.1: not in Table 13.1 */ + case FFA_PARTITION_INFO_GET_REGS: /* Optional for virtual instances per 13.1 */ + return false; + } + + return true; +} + +static bool do_ffa_features(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + u64 prop = 0; + int ret = 0; + + if (!ffa_call_supported(id)) { + ret = FFA_RET_NOT_SUPPORTED; + goto out_handled; + } + + switch (id) { + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + ret = FFA_RET_SUCCESS; + prop = 0; /* No support for dynamic buffers */ + goto out_handled; + default: + return false; + } + +out_handled: + ffa_to_smccc_res_prop(res, ret, prop); + return true; +} + +static int hyp_ffa_post_init(void) +{ + size_t min_rxtx_sz; + struct arm_smccc_1_2_regs res; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_ID_GET, + }, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + if (res.a2 != HOST_FFA_ID) + return -EINVAL; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_FEATURES, + .a1 = FFA_FN64_RXTX_MAP, + }, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) { + case FFA_FEAT_RXTX_MIN_SZ_4K: + min_rxtx_sz = SZ_4K; + break; + case FFA_FEAT_RXTX_MIN_SZ_16K: + min_rxtx_sz = SZ_16K; + break; + case FFA_FEAT_RXTX_MIN_SZ_64K: + min_rxtx_sz = SZ_64K; + break; + default: + return -EINVAL; + } + + if (min_rxtx_sz > PAGE_SIZE) + return -EOPNOTSUPP; + + return 0; +} + +static void do_ffa_version(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, ffa_req_version, ctxt, 1); + + if (FFA_MAJOR_VERSION(ffa_req_version) != 1) { + res->a0 = FFA_RET_NOT_SUPPORTED; + return; + } + + hyp_spin_lock(&version_lock); + if (has_version_negotiated) { + if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) + res->a0 = FFA_RET_NOT_SUPPORTED; + else + res->a0 = hyp_ffa_version; + goto unlock; + } + + /* + * If the client driver tries to downgrade the version, we need to ask + * first if TEE supports it. 
+ */ + if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) { + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = ffa_req_version, + }, res); + if (res->a0 == FFA_RET_NOT_SUPPORTED) + goto unlock; + + hyp_ffa_version = ffa_req_version; + } + + if (hyp_ffa_post_init()) { + res->a0 = FFA_RET_NOT_SUPPORTED; + } else { + smp_store_release(&has_version_negotiated, true); + res->a0 = hyp_ffa_version; + } +unlock: + hyp_spin_unlock(&version_lock); +} + +static void do_ffa_part_get(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, uuid0, ctxt, 1); + DECLARE_REG(u32, uuid1, ctxt, 2); + DECLARE_REG(u32, uuid2, ctxt, 3); + DECLARE_REG(u32, uuid3, ctxt, 4); + DECLARE_REG(u32, flags, ctxt, 5); + u32 count, partition_sz, copy_sz; + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.rx) { + ffa_to_smccc_res(res, FFA_RET_BUSY); + goto out_unlock; + } + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_PARTITION_INFO_GET, + .a1 = uuid0, + .a2 = uuid1, + .a3 = uuid2, + .a4 = uuid3, + .a5 = flags, + }, res); + + if (res->a0 != FFA_SUCCESS) + goto out_unlock; + + count = res->a2; + if (!count) + goto out_unlock; + + if (hyp_ffa_version > FFA_VERSION_1_0) { + /* Get the number of partitions deployed in the system */ + if (flags & 0x1) + goto out_unlock; + + partition_sz = res->a3; + } else { + /* FFA_VERSION_1_0 lacks the size in the response */ + partition_sz = FFA_1_0_PARTITON_INFO_SZ; + } + + copy_sz = partition_sz * count; + if (copy_sz > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { + ffa_to_smccc_res(res, FFA_RET_ABORTED); + goto out_unlock; + } + + memcpy(host_buffers.rx, hyp_buffers.rx, copy_sz); +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +} + +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) +{ + struct arm_smccc_1_2_regs res; + + /* + * There's no way we can tell what a non-standard SMC call might + * be up to. Ideally, we would terminate these here and return + * an error to the host, but sadly devices make use of custom + * firmware calls for things like power management, debugging, + * RNG access and crash reporting. + * + * Given that the architecture requires us to trust EL3 anyway, + * we forward unrecognised calls on under the assumption that + * the firmware doesn't expose a mechanism to access arbitrary + * non-secure memory. Short of a per-device table of SMCs, this + * is the best we can do. 
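
A short sketch of the version words that do_ffa_version() above negotiates: an FF-A version packs the major number in bits [30:16] and the minor in bits [15:0], and two sides with the same major settle on the smaller of the two minors. This is the general rule only; the proxy adds its own twists, such as asking EL3 before accepting a downgrade, and the macro names below are local to the sketch rather than taken from the kernel headers.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FFA_VER(major, minor)  (((uint32_t)(major) << 16) | (minor))
#define FFA_VER_MAJOR(v)       (((v) >> 16) & 0x7fff)
#define FFA_VER_MINOR(v)       ((v) & 0xffff)

static bool same_major(uint32_t a, uint32_t b)
{
	return FFA_VER_MAJOR(a) == FFA_VER_MAJOR(b);
}

/* With matching majors the two sides settle on the smaller minor. */
static uint32_t negotiate(uint32_t mine, uint32_t theirs)
{
	uint32_t minor = FFA_VER_MINOR(mine) < FFA_VER_MINOR(theirs) ?
			 FFA_VER_MINOR(mine) : FFA_VER_MINOR(theirs);

	return FFA_VER(FFA_VER_MAJOR(mine), minor);
}

int main(void)
{
	uint32_t hyp = FFA_VER(1, 2);   /* what the proxy implements      */
	uint32_t drv = FFA_VER(1, 1);   /* what the host driver asks for  */

	if (same_major(hyp, drv))
		printf("negotiated 1.%u\n", FFA_VER_MINOR(negotiate(hyp, drv)));
	printf("0x%x has major %u\n", hyp, FFA_VER_MAJOR(hyp));
	return 0;
}
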
+ */ + if (!is_ffa_call(func_id)) + return false; + + if (func_id != FFA_VERSION && + !smp_load_acquire(&has_version_negotiated)) { + ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS); + goto out_handled; + } + + switch (func_id) { + case FFA_FEATURES: + if (!do_ffa_features(&res, host_ctxt)) + return false; + goto out_handled; + /* Memory management */ + case FFA_FN64_RXTX_MAP: + do_ffa_rxtx_map(&res, host_ctxt); + goto out_handled; + case FFA_RXTX_UNMAP: + do_ffa_rxtx_unmap(&res, host_ctxt); + goto out_handled; + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt); + goto out_handled; + case FFA_MEM_RECLAIM: + do_ffa_mem_reclaim(&res, host_ctxt); + goto out_handled; + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt); + goto out_handled; + case FFA_MEM_FRAG_TX: + do_ffa_mem_frag_tx(&res, host_ctxt); + goto out_handled; + case FFA_VERSION: + do_ffa_version(&res, host_ctxt); + goto out_handled; + case FFA_PARTITION_INFO_GET: + do_ffa_part_get(&res, host_ctxt); + goto out_handled; + } + + if (ffa_call_supported(func_id)) + return false; /* Pass through */ + + ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED); +out_handled: + ffa_set_retval(host_ctxt, &res); + return true; +} + +int hyp_ffa_init(void *pages) +{ + struct arm_smccc_1_2_regs res; + void *tx, *rx; + + if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) + return 0; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = FFA_VERSION_1_2, + }, &res); + if (res.a0 == FFA_RET_NOT_SUPPORTED) + return 0; + + /* + * Firmware returns the maximum supported version of the FF-A + * implementation. Check that the returned version is + * backwards-compatible with the hyp according to the rules in DEN0077A + * v1.1 REL0 13.2.1. + * + * Of course, things are never simple when dealing with firmware. v1.1 + * broke ABI with v1.0 on several structures, which is itself + * incompatible with the aforementioned versioning scheme. The + * expectation is that v1.x implementations that do not support the v1.0 + * ABI return NOT_SUPPORTED rather than a version number, according to + * DEN0077A v1.1 REL0 18.6.4. + */ + if (FFA_MAJOR_VERSION(res.a0) != 1) + return -EOPNOTSUPP; + + if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_2)) + hyp_ffa_version = res.a0; + else + hyp_ffa_version = FFA_VERSION_1_2; + + tx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + rx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + + ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) { + .buf = pages, + .len = PAGE_SIZE * + (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)), + }; + + hyp_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + .tx = tx, + .rx = rx, + }; + + host_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + }; + + version_lock = __HYP_SPIN_LOCK_UNLOCKED; + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c new file mode 100644 index 000000000000..b63f4e1c1033 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c @@ -0,0 +1,462 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google LLC + * Author: David Brazdil <dbrazdil@google.com> + * + * Generates relocation information used by the kernel to convert + * absolute addresses in hyp data from kernel VAs to hyp VAs. 
+ * + * This is necessary because hyp code is linked into the same binary + * as the kernel but executes under different memory mappings. + * If the compiler used absolute addressing, those addresses need to + * be converted before they are used by hyp code. + * + * The input of this program is the relocatable ELF object containing + * all hyp code/data, not yet linked into vmlinux. Hyp section names + * should have been prefixed with `.hyp` at this point. + * + * The output (printed to stdout) is an assembly file containing + * an array of 32-bit integers and static relocations that instruct + * the linker of `vmlinux` to populate the array entries with offsets + * to positions in the kernel binary containing VAs used by hyp code. + * + * Note that dynamic relocations could be used for the same purpose. + * However, those are only generated if CONFIG_RELOCATABLE=y. + */ + +#include <elf.h> +#include <endian.h> +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <generated/autoconf.h> + +#define HYP_SECTION_PREFIX ".hyp" +#define HYP_RELOC_SECTION ".hyp.reloc" +#define HYP_SECTION_SYMBOL_PREFIX "__hyp_section_" + +/* + * AArch64 relocation type constants. + * Included in case these are not defined in the host toolchain. + */ +#ifndef R_AARCH64_ABS64 +#define R_AARCH64_ABS64 257 +#endif +#ifndef R_AARCH64_ABS32 +#define R_AARCH64_ABS32 258 +#endif +#ifndef R_AARCH64_PREL64 +#define R_AARCH64_PREL64 260 +#endif +#ifndef R_AARCH64_PREL32 +#define R_AARCH64_PREL32 261 +#endif +#ifndef R_AARCH64_PREL16 +#define R_AARCH64_PREL16 262 +#endif +#ifndef R_AARCH64_PLT32 +#define R_AARCH64_PLT32 314 +#endif +#ifndef R_AARCH64_LD_PREL_LO19 +#define R_AARCH64_LD_PREL_LO19 273 +#endif +#ifndef R_AARCH64_ADR_PREL_LO21 +#define R_AARCH64_ADR_PREL_LO21 274 +#endif +#ifndef R_AARCH64_ADR_PREL_PG_HI21 +#define R_AARCH64_ADR_PREL_PG_HI21 275 +#endif +#ifndef R_AARCH64_ADR_PREL_PG_HI21_NC +#define R_AARCH64_ADR_PREL_PG_HI21_NC 276 +#endif +#ifndef R_AARCH64_ADD_ABS_LO12_NC +#define R_AARCH64_ADD_ABS_LO12_NC 277 +#endif +#ifndef R_AARCH64_LDST8_ABS_LO12_NC +#define R_AARCH64_LDST8_ABS_LO12_NC 278 +#endif +#ifndef R_AARCH64_TSTBR14 +#define R_AARCH64_TSTBR14 279 +#endif +#ifndef R_AARCH64_CONDBR19 +#define R_AARCH64_CONDBR19 280 +#endif +#ifndef R_AARCH64_JUMP26 +#define R_AARCH64_JUMP26 282 +#endif +#ifndef R_AARCH64_CALL26 +#define R_AARCH64_CALL26 283 +#endif +#ifndef R_AARCH64_LDST16_ABS_LO12_NC +#define R_AARCH64_LDST16_ABS_LO12_NC 284 +#endif +#ifndef R_AARCH64_LDST32_ABS_LO12_NC +#define R_AARCH64_LDST32_ABS_LO12_NC 285 +#endif +#ifndef R_AARCH64_LDST64_ABS_LO12_NC +#define R_AARCH64_LDST64_ABS_LO12_NC 286 +#endif +#ifndef R_AARCH64_MOVW_PREL_G0 +#define R_AARCH64_MOVW_PREL_G0 287 +#endif +#ifndef R_AARCH64_MOVW_PREL_G0_NC +#define R_AARCH64_MOVW_PREL_G0_NC 288 +#endif +#ifndef R_AARCH64_MOVW_PREL_G1 +#define R_AARCH64_MOVW_PREL_G1 289 +#endif +#ifndef R_AARCH64_MOVW_PREL_G1_NC +#define R_AARCH64_MOVW_PREL_G1_NC 290 +#endif +#ifndef R_AARCH64_MOVW_PREL_G2 +#define R_AARCH64_MOVW_PREL_G2 291 +#endif +#ifndef R_AARCH64_MOVW_PREL_G2_NC +#define R_AARCH64_MOVW_PREL_G2_NC 292 +#endif +#ifndef R_AARCH64_MOVW_PREL_G3 +#define R_AARCH64_MOVW_PREL_G3 293 +#endif +#ifndef R_AARCH64_LDST128_ABS_LO12_NC +#define R_AARCH64_LDST128_ABS_LO12_NC 299 +#endif + +/* Global state of the processed ELF. 
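+ * Populated once by init_elf() and only read afterwards.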
*/ +static struct { + const char *path; + char *begin; + size_t size; + Elf64_Ehdr *ehdr; + Elf64_Shdr *sh_table; + const char *sh_string; +} elf; + +#if defined(CONFIG_CPU_LITTLE_ENDIAN) + +#define elf16toh(x) le16toh(x) +#define elf32toh(x) le32toh(x) +#define elf64toh(x) le64toh(x) + +#define ELFENDIAN ELFDATA2LSB + +#elif defined(CONFIG_CPU_BIG_ENDIAN) + +#define elf16toh(x) be16toh(x) +#define elf32toh(x) be32toh(x) +#define elf64toh(x) be64toh(x) + +#define ELFENDIAN ELFDATA2MSB + +#else + +#error PDP-endian sadly unsupported... + +#endif + +#define fatal_error(fmt, ...) \ + ({ \ + fprintf(stderr, "error: %s: " fmt "\n", \ + elf.path, ## __VA_ARGS__); \ + exit(EXIT_FAILURE); \ + __builtin_unreachable(); \ + }) + +#define fatal_perror(msg) \ + ({ \ + fprintf(stderr, "error: %s: " msg ": %s\n", \ + elf.path, strerror(errno)); \ + exit(EXIT_FAILURE); \ + __builtin_unreachable(); \ + }) + +#define assert_op(lhs, rhs, fmt, op) \ + ({ \ + typeof(lhs) _lhs = (lhs); \ + typeof(rhs) _rhs = (rhs); \ + \ + if (!(_lhs op _rhs)) { \ + fatal_error("assertion " #lhs " " #op " " #rhs \ + " failed (lhs=" fmt ", rhs=" fmt \ + ", line=%d)", _lhs, _rhs, __LINE__); \ + } \ + }) + +#define assert_eq(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, ==) +#define assert_ne(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, !=) +#define assert_lt(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, <) +#define assert_ge(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, >=) + +/* + * Return a pointer of a given type at a given offset from + * the beginning of the ELF file. + */ +#define elf_ptr(type, off) ((type *)(elf.begin + (off))) + +/* Iterate over all sections in the ELF. */ +#define for_each_section(var) \ + for (var = elf.sh_table; var < elf.sh_table + elf16toh(elf.ehdr->e_shnum); ++var) + +/* Iterate over all Elf64_Rela relocations in a given section. */ +#define for_each_rela(shdr, var) \ + for (var = elf_ptr(Elf64_Rela, elf64toh(shdr->sh_offset)); \ + var < elf_ptr(Elf64_Rela, elf64toh(shdr->sh_offset) + elf64toh(shdr->sh_size)); var++) + +/* True if a string starts with a given prefix. */ +static inline bool starts_with(const char *str, const char *prefix) +{ + return memcmp(str, prefix, strlen(prefix)) == 0; +} + +/* Returns a string containing the name of a given section. */ +static inline const char *section_name(Elf64_Shdr *shdr) +{ + return elf.sh_string + elf32toh(shdr->sh_name); +} + +/* Returns a pointer to the first byte of section data. */ +static inline const char *section_begin(Elf64_Shdr *shdr) +{ + return elf_ptr(char, elf64toh(shdr->sh_offset)); +} + +/* Find a section by its offset from the beginning of the file. */ +static inline Elf64_Shdr *section_by_off(Elf64_Off off) +{ + assert_ne(off, 0UL, "%lu"); + return elf_ptr(Elf64_Shdr, off); +} + +/* Find a section by its index. */ +static inline Elf64_Shdr *section_by_idx(uint16_t idx) +{ + assert_ne(idx, SHN_UNDEF, "%u"); + return &elf.sh_table[idx]; +} + +/* + * Memory-map the given ELF file, perform sanity checks, and + * populate global state. + */ +static void init_elf(const char *path) +{ + int fd, ret; + struct stat stat; + + /* Store path in the global struct for error printing. */ + elf.path = path; + + /* Open the ELF file. */ + fd = open(path, O_RDONLY); + if (fd < 0) + fatal_perror("Could not open ELF file"); + + /* Get status of ELF file to obtain its size. */ + ret = fstat(fd, &stat); + if (ret < 0) { + close(fd); + fatal_perror("Could not get status of ELF file"); + } + + /* mmap() the entire ELF file read-only at an arbitrary address. 
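+ * A private, read-only mapping is sufficient: the tool only parses the
+ * object and writes the generated assembly to stdout.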
*/ + elf.begin = mmap(0, stat.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (elf.begin == MAP_FAILED) { + close(fd); + fatal_perror("Could not mmap ELF file"); + } + + /* mmap() was successful, close the FD. */ + close(fd); + + /* Get pointer to the ELF header. */ + assert_ge(stat.st_size, sizeof(*elf.ehdr), "%lu"); + elf.ehdr = elf_ptr(Elf64_Ehdr, 0); + + /* Check the ELF magic. */ + assert_eq(elf.ehdr->e_ident[EI_MAG0], ELFMAG0, "0x%x"); + assert_eq(elf.ehdr->e_ident[EI_MAG1], ELFMAG1, "0x%x"); + assert_eq(elf.ehdr->e_ident[EI_MAG2], ELFMAG2, "0x%x"); + assert_eq(elf.ehdr->e_ident[EI_MAG3], ELFMAG3, "0x%x"); + + /* Sanity check that this is an ELF64 relocatable object for AArch64. */ + assert_eq(elf.ehdr->e_ident[EI_CLASS], ELFCLASS64, "%u"); + assert_eq(elf.ehdr->e_ident[EI_DATA], ELFENDIAN, "%u"); + assert_eq(elf16toh(elf.ehdr->e_type), ET_REL, "%u"); + assert_eq(elf16toh(elf.ehdr->e_machine), EM_AARCH64, "%u"); + + /* Populate fields of the global struct. */ + elf.sh_table = section_by_off(elf64toh(elf.ehdr->e_shoff)); + elf.sh_string = section_begin(section_by_idx(elf16toh(elf.ehdr->e_shstrndx))); +} + +/* Print the prologue of the output ASM file. */ +static void emit_prologue(void) +{ + printf(".data\n" + ".pushsection " HYP_RELOC_SECTION ", \"a\"\n"); +} + +/* Print ASM statements needed as a prologue to a processed hyp section. */ +static void emit_section_prologue(const char *sh_orig_name) +{ + /* Declare the hyp section symbol. */ + printf(".global %s%s\n", HYP_SECTION_SYMBOL_PREFIX, sh_orig_name); +} + +/* + * Print ASM statements to create a hyp relocation entry for a given + * R_AARCH64_ABS64 relocation. + * + * The linker of vmlinux will populate the position given by `rela` with + * an absolute 64-bit kernel VA. If the kernel is relocatable, it will + * also generate a dynamic relocation entry so that the kernel can shift + * the address at runtime for KASLR. + * + * Emit a 32-bit offset from the current address to the position given + * by `rela`. This way the kernel can iterate over all kernel VAs used + * by hyp at runtime and convert them to hyp VAs. However, that offset + * will not be known until linking of `vmlinux`, so emit a PREL32 + * relocation referencing a symbol that the hyp linker script put at + * the beginning of the relocated section + the offset from `rela`. + */ +static void emit_rela_abs64(Elf64_Rela *rela, const char *sh_orig_name) +{ + /* Offset of this reloc from the beginning of HYP_RELOC_SECTION. */ + static size_t reloc_offset; + + /* Create storage for the 32-bit offset. */ + printf(".word 0\n"); + + /* + * Create a PREL32 relocation which instructs the linker of `vmlinux` + * to insert offset to position <base> + <offset>, where <base> is + * a symbol at the beginning of the relocated section, and <offset> + * is `rela->r_offset`. + */ + printf(".reloc %lu, R_AARCH64_PREL32, %s%s + 0x%lx\n", + reloc_offset, HYP_SECTION_SYMBOL_PREFIX, sh_orig_name, + elf64toh(rela->r_offset)); + + reloc_offset += 4; +} + +/* Print the epilogue of the output ASM file. */ +static void emit_epilogue(void) +{ + printf(".popsection\n"); +} + +/* + * Iterate over all RELA relocations in a given section and emit + * hyp relocation data for all absolute addresses in hyp code/data. + * + * Static relocations that generate PC-relative-addressing are ignored. + * Failure is reported for unexpected relocation types. 
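+ *
+ * As an illustration (section name and offset made up), an R_AARCH64_ABS64
+ * entry at offset 0x10 of a section named ".hyp.rodata" makes
+ * emit_rela_abs64() print roughly:
+ *
+ *   .word 0
+ *   .reloc 0, R_AARCH64_PREL32, __hyp_section_.hyp.rodata + 0x10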
+ */ +static void emit_rela_section(Elf64_Shdr *sh_rela) +{ + Elf64_Shdr *sh_orig = &elf.sh_table[elf32toh(sh_rela->sh_info)]; + const char *sh_orig_name = section_name(sh_orig); + Elf64_Rela *rela; + + /* Skip all non-hyp sections. */ + if (!starts_with(sh_orig_name, HYP_SECTION_PREFIX)) + return; + + emit_section_prologue(sh_orig_name); + + for_each_rela(sh_rela, rela) { + uint32_t type = (uint32_t)elf64toh(rela->r_info); + + /* Check that rela points inside the relocated section. */ + assert_lt(elf64toh(rela->r_offset), elf64toh(sh_orig->sh_size), "0x%lx"); + + switch (type) { + /* + * Data relocations to generate absolute addressing. + * Emit a hyp relocation. + */ + case R_AARCH64_ABS64: + emit_rela_abs64(rela, sh_orig_name); + break; + /* Allow 32-bit absolute relocation, for kCFI type hashes. */ + case R_AARCH64_ABS32: + break; + /* Allow position-relative data relocations. */ + case R_AARCH64_PREL64: + case R_AARCH64_PREL32: + case R_AARCH64_PREL16: + case R_AARCH64_PLT32: + break; + /* Allow relocations to generate PC-relative addressing. */ + case R_AARCH64_LD_PREL_LO19: + case R_AARCH64_ADR_PREL_LO21: + case R_AARCH64_ADR_PREL_PG_HI21: + case R_AARCH64_ADR_PREL_PG_HI21_NC: + case R_AARCH64_ADD_ABS_LO12_NC: + case R_AARCH64_LDST8_ABS_LO12_NC: + case R_AARCH64_LDST16_ABS_LO12_NC: + case R_AARCH64_LDST32_ABS_LO12_NC: + case R_AARCH64_LDST64_ABS_LO12_NC: + case R_AARCH64_LDST128_ABS_LO12_NC: + break; + /* Allow relative relocations for control-flow instructions. */ + case R_AARCH64_TSTBR14: + case R_AARCH64_CONDBR19: + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: + break; + /* Allow group relocations to create PC-relative offset inline. */ + case R_AARCH64_MOVW_PREL_G0: + case R_AARCH64_MOVW_PREL_G0_NC: + case R_AARCH64_MOVW_PREL_G1: + case R_AARCH64_MOVW_PREL_G1_NC: + case R_AARCH64_MOVW_PREL_G2: + case R_AARCH64_MOVW_PREL_G2_NC: + case R_AARCH64_MOVW_PREL_G3: + break; + default: + fatal_error("Unexpected RELA type %u", type); + } + } +} + +/* Iterate over all sections and emit hyp relocation data for RELA sections. 
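+ * SHT_REL sections are rejected outright: AArch64 relocatable objects are
+ * expected to carry explicit addends (SHT_RELA) only.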
*/ +static void emit_all_relocs(void) +{ + Elf64_Shdr *shdr; + + for_each_section(shdr) { + switch (elf32toh(shdr->sh_type)) { + case SHT_REL: + fatal_error("Unexpected SHT_REL section \"%s\"", + section_name(shdr)); + case SHT_RELA: + emit_rela_section(shdr); + break; + } + } +} + +int main(int argc, const char **argv) +{ + if (argc != 2) { + fprintf(stderr, "Usage: %s <elf_input>\n", argv[0]); + return EXIT_FAILURE; + } + + init_elf(argv[1]); + + emit_prologue(); + emit_all_relocs(); + emit_epilogue(); + + return EXIT_SUCCESS; +} diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S new file mode 100644 index 000000000000..eef15b374abb --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -0,0 +1,303 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 - Google Inc + * Author: Andrew Scull <ascull@google.com> + */ + +#include <linux/linkage.h> + +#include <asm/assembler.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_ptrauth.h> + + .text + +SYM_FUNC_START(__host_exit) + get_host_ctxt x0, x1 + + /* Store the host regs x2 and x3 */ + stp x2, x3, [x0, #CPU_XREG_OFFSET(2)] + + /* Retrieve the host regs x0-x1 from the stack */ + ldp x2, x3, [sp], #16 // x0, x1 + + /* Store the host regs x0-x1 and x4-x17 */ + stp x2, x3, [x0, #CPU_XREG_OFFSET(0)] + stp x4, x5, [x0, #CPU_XREG_OFFSET(4)] + stp x6, x7, [x0, #CPU_XREG_OFFSET(6)] + stp x8, x9, [x0, #CPU_XREG_OFFSET(8)] + stp x10, x11, [x0, #CPU_XREG_OFFSET(10)] + stp x12, x13, [x0, #CPU_XREG_OFFSET(12)] + stp x14, x15, [x0, #CPU_XREG_OFFSET(14)] + stp x16, x17, [x0, #CPU_XREG_OFFSET(16)] + + /* Store the host regs x18-x29, lr */ + save_callee_saved_regs x0 + + /* Save the host context pointer in x29 across the function call */ + mov x29, x0 + +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL +alternative_if_not ARM64_HAS_ADDRESS_AUTH +b __skip_pauth_save +alternative_else_nop_endif + +alternative_if ARM64_KVM_PROTECTED_MODE + /* Save kernel ptrauth keys. */ + add x18, x29, #CPU_APIAKEYLO_EL1 + ptrauth_save_state x18, x19, x20 + + /* Use hyp keys. */ + adr_this_cpu x18, kvm_hyp_ctxt, x19 + add x18, x18, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state x18, x19, x20 + isb +alternative_else_nop_endif +__skip_pauth_save: +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + + bl handle_trap + +__host_enter_restore_full: + /* Restore kernel keys. */ +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL +alternative_if_not ARM64_HAS_ADDRESS_AUTH +b __skip_pauth_restore +alternative_else_nop_endif + +alternative_if ARM64_KVM_PROTECTED_MODE + add x18, x29, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state x18, x19, x20 +alternative_else_nop_endif +__skip_pauth_restore: +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + + /* Restore host regs x0-x17 */ + ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] + ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] + ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] + ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)] + + /* x0-7 are use for panic arguments */ +__host_enter_for_panic: + ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)] + ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)] + ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)] + ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)] + ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)] + + /* Restore host regs x18-x29, lr */ + restore_callee_saved_regs x29 + + /* Do not touch any register after this! 
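+ * All host GPRs have been restored; nothing but the exception return
+ * (and its trailing speculation barrier) may run beyond this point.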
*/ +__host_enter_without_restoring: + eret + sb +SYM_FUNC_END(__host_exit) + +/* + * void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); + */ +SYM_FUNC_START(__host_enter) + mov x29, x0 + b __host_enter_restore_full +SYM_FUNC_END(__host_enter) + +/* + * void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, + * u64 elr, u64 par); + */ +SYM_FUNC_START(__hyp_do_panic) + /* Prepare and exit to the host's panic function. */ + mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ + PSR_MODE_EL1h) + msr spsr_el2, lr + adr_l lr, nvhe_hyp_panic_handler + hyp_kimg_va lr, x6 + msr elr_el2, lr + + mov x29, x0 + +#ifdef CONFIG_NVHE_EL2_DEBUG + /* Ensure host stage-2 is disabled */ + mrs x0, hcr_el2 + bic x0, x0, #HCR_VM + msr_hcr_el2 x0 + isb + tlbi vmalls12e1 + dsb nsh +#endif + + /* Load the panic arguments into x0-7 */ + mrs x0, esr_el2 + mov x4, x3 + mov x3, x2 + hyp_pa x3, x6 + get_vcpu_ptr x5, x6 + mrs x6, far_el2 + mrs x7, hpfar_el2 + + /* Enter the host, conditionally restoring the host context. */ + cbz x29, __host_enter_without_restoring + b __host_enter_for_panic +SYM_FUNC_END(__hyp_do_panic) + +SYM_FUNC_START(__host_hvc) + ldp x0, x1, [sp] // Don't fixup the stack yet + + /* No stub for you, sonny Jim */ +alternative_if ARM64_KVM_PROTECTED_MODE + b __host_exit +alternative_else_nop_endif + + /* Check for a stub HVC call */ + cmp x0, #HVC_STUB_HCALL_NR + b.hs __host_exit + + add sp, sp, #16 + /* + * Compute the idmap address of __kvm_handle_stub_hvc and + * jump there. + * + * Preserve x0-x4, which may contain stub parameters. + */ + adr_l x5, __kvm_handle_stub_hvc + hyp_pa x5, x6 + br x5 +SYM_FUNC_END(__host_hvc) + +.macro host_el1_sync_vect + .align 7 +.L__vect_start\@: + stp x0, x1, [sp, #-16]! + mrs x0, esr_el2 + ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH + cmp x0, #ESR_ELx_EC_HVC64 + b.eq __host_hvc + b __host_exit +.L__vect_end\@: +.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) + .error "host_el1_sync_vect larger than vector entry" +.endif +.endm + +.macro invalid_host_el2_vect + .align 7 + + /* + * Test whether the SP has overflowed, without corrupting a GPR. + * nVHE hypervisor stacks are aligned so that the NVHE_STACK_SHIFT bit + * of SP should always be 1. + */ + add sp, sp, x0 // sp' = sp + x0 + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp + tbz x0, #NVHE_STACK_SHIFT, .L__hyp_sp_overflow\@ + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + + /* + * The panic may not be clean if the exception is taken before the host + * context has been saved by __host_exit or after the hyp context has + * been partially clobbered by __host_enter. + */ + b hyp_panic + +.L__hyp_sp_overflow\@: + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 + + b hyp_panic_bad_stack + ASM_BUG() +.endm + +.macro invalid_host_el1_vect + .align 7 + mov x0, xzr /* restore_host = false */ + mrs x1, spsr_el2 + mrs x2, elr_el2 + mrs x3, par_el1 + b __hyp_do_panic +.endm + +/* + * The host vector does not use an ESB instruction in order to avoid consuming + * SErrors that should only be consumed by the host. Guest entry is deferred by + * __guest_enter if there are any pending asynchronous exceptions so hyp will + * always return to the host without having consumerd host SErrors. + * + * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the + * host knows about the EL2 vectors already, and there is no point in hiding + * them. 
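+ *
+ * Each vector entry below must fit in its architectural 128-byte slot;
+ * host_el1_sync_vect enforces this with an assembly-time size check.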
+ */ + .align 11 +SYM_CODE_START(__kvm_hyp_host_vector) + invalid_host_el2_vect // Synchronous EL2t + invalid_host_el2_vect // IRQ EL2t + invalid_host_el2_vect // FIQ EL2t + invalid_host_el2_vect // Error EL2t + + invalid_host_el2_vect // Synchronous EL2h + invalid_host_el2_vect // IRQ EL2h + invalid_host_el2_vect // FIQ EL2h + invalid_host_el2_vect // Error EL2h + + host_el1_sync_vect // Synchronous 64-bit EL1/EL0 + invalid_host_el1_vect // IRQ 64-bit EL1/EL0 + invalid_host_el1_vect // FIQ 64-bit EL1/EL0 + invalid_host_el1_vect // Error 64-bit EL1/EL0 + + host_el1_sync_vect // Synchronous 32-bit EL1/EL0 + invalid_host_el1_vect // IRQ 32-bit EL1/EL0 + invalid_host_el1_vect // FIQ 32-bit EL1/EL0 + invalid_host_el1_vect // Error 32-bit EL1/EL0 +SYM_CODE_END(__kvm_hyp_host_vector) + +/* + * Forward SMC with arguments in struct kvm_cpu_context, and + * store the result into the same struct. Assumes SMCCC 1.2 or older. + * + * x0: struct kvm_cpu_context* + */ +SYM_CODE_START(__kvm_hyp_host_forward_smc) + /* + * Use x18 to keep the pointer to the host context because + * x18 is callee-saved in SMCCC but not in AAPCS64. + */ + mov x18, x0 + + ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)] + ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)] + ldp x4, x5, [x18, #CPU_XREG_OFFSET(4)] + ldp x6, x7, [x18, #CPU_XREG_OFFSET(6)] + ldp x8, x9, [x18, #CPU_XREG_OFFSET(8)] + ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)] + ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)] + ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)] + ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)] + + smc #0 + + stp x0, x1, [x18, #CPU_XREG_OFFSET(0)] + stp x2, x3, [x18, #CPU_XREG_OFFSET(2)] + stp x4, x5, [x18, #CPU_XREG_OFFSET(4)] + stp x6, x7, [x18, #CPU_XREG_OFFSET(6)] + stp x8, x9, [x18, #CPU_XREG_OFFSET(8)] + stp x10, x11, [x18, #CPU_XREG_OFFSET(10)] + stp x12, x13, [x18, #CPU_XREG_OFFSET(12)] + stp x14, x15, [x18, #CPU_XREG_OFFSET(14)] + stp x16, x17, [x18, #CPU_XREG_OFFSET(16)] + + ret +SYM_CODE_END(__kvm_hyp_host_forward_smc) + +/* + * kvm_host_psci_cpu_entry is called through br instruction, which requires + * bti j instruction as compilers (gcc and llvm) doesn't insert bti j for external + * functions, but bti c instead. + */ +SYM_CODE_START(kvm_host_psci_cpu_entry) + bti j + b __kvm_host_psci_cpu_entry +SYM_CODE_END(kvm_host_psci_cpu_entry) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S new file mode 100644 index 000000000000..aada42522e7b --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/arm-smccc.h> +#include <linux/cfi_types.h> +#include <linux/linkage.h> + +#include <asm/alternative.h> +#include <asm/assembler.h> +#include <asm/el2_setup.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> +#include <asm/pgtable-hwdef.h> +#include <asm/sysreg.h> +#include <asm/virt.h> + + .text + .pushsection .idmap.text, "ax" + + .align 11 + +SYM_CODE_START(__kvm_hyp_init) + ventry . // Synchronous EL2t + ventry . // IRQ EL2t + ventry . // FIQ EL2t + ventry . // Error EL2t + + ventry . // Synchronous EL2h + ventry . // IRQ EL2h + ventry . // FIQ EL2h + ventry . // Error EL2h + + ventry __do_hyp_init // Synchronous 64-bit EL1 + ventry . // IRQ 64-bit EL1 + ventry . // FIQ 64-bit EL1 + ventry . // Error 64-bit EL1 + + ventry . // Synchronous 32-bit EL1 + ventry . // IRQ 32-bit EL1 + ventry . // FIQ 32-bit EL1 + ventry . 
// Error 32-bit EL1 + + /* + * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers. + * + * x0: SMCCC function ID + * x1: struct kvm_nvhe_init_params PA + */ +__do_hyp_init: + /* Check for a stub HVC call */ + cmp x0, #HVC_STUB_HCALL_NR + b.lo __kvm_handle_stub_hvc + + bic x0, x0, #ARM_SMCCC_CALL_HINTS + mov x3, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) + cmp x0, x3 + b.eq 1f + + mov x0, #SMCCC_RET_NOT_SUPPORTED + eret + +1: mov x0, x1 + mov x3, lr + bl ___kvm_hyp_init // Clobbers x0..x2 + mov lr, x3 + + /* Hello, World! */ + mov x0, #SMCCC_RET_SUCCESS + eret +SYM_CODE_END(__kvm_hyp_init) + +/* + * Initialize EL2 CPU state to sane values. + * + * HCR_EL2.E2H must have been initialized already. + */ +SYM_CODE_START_LOCAL(__kvm_init_el2_state) + init_el2_state // Clobbers x0..x2 + finalise_el2_state + ret +SYM_CODE_END(__kvm_init_el2_state) + +/* + * Initialize the hypervisor in EL2. + * + * Only uses x0..x2 so as to not clobber callee-saved SMCCC registers + * and leave x3 for the caller. + * + * x0: struct kvm_nvhe_init_params PA + */ +SYM_CODE_START_LOCAL(___kvm_hyp_init) + ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA] + mov sp, x1 + + ldr x1, [x0, #NVHE_INIT_MAIR_EL2] + msr mair_el2, x1 + + ldr x1, [x0, #NVHE_INIT_HCR_EL2] + msr_hcr_el2 x1 + + mov x2, #HCR_E2H + and x2, x1, x2 + cbz x2, 1f + + // hVHE: Replay the EL2 setup to account for the E2H bit + // TPIDR_EL2 is used to preserve x0 across the macro maze... + isb + msr tpidr_el2, x0 + str lr, [x0, #NVHE_INIT_TMP] + + bl __kvm_init_el2_state + + mrs x0, tpidr_el2 + ldr lr, [x0, #NVHE_INIT_TMP] + +1: + ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] + msr tpidr_el2, x1 + + ldr x1, [x0, #NVHE_INIT_VTTBR] + msr vttbr_el2, x1 + + ldr x1, [x0, #NVHE_INIT_VTCR] + msr vtcr_el2, x1 + + ldr x1, [x0, #NVHE_INIT_PGD_PA] + phys_to_ttbr x2, x1 +alternative_if ARM64_HAS_CNP + orr x2, x2, #TTBR_CNP_BIT +alternative_else_nop_endif + msr ttbr0_el2, x2 + + ldr x0, [x0, #NVHE_INIT_TCR_EL2] + msr tcr_el2, x0 + + isb + + /* Invalidate the stale TLBs from Bootloader */ + tlbi alle2 + tlbi alle1 + dsb sy + + mov_q x0, INIT_SCTLR_EL2_MMU_ON +alternative_if ARM64_HAS_ADDRESS_AUTH + mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ + SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) + orr x0, x0, x1 +alternative_else_nop_endif + +#ifdef CONFIG_ARM64_BTI_KERNEL +alternative_if ARM64_BTI + orr x0, x0, #SCTLR_EL2_BT +alternative_else_nop_endif +#endif /* CONFIG_ARM64_BTI_KERNEL */ + + msr sctlr_el2, x0 + isb + + /* Set the host vector */ + ldr x0, =__kvm_hyp_host_vector + msr vbar_el2, x0 + + ret +SYM_CODE_END(___kvm_hyp_init) + +/* + * PSCI CPU_ON entry point + * + * x0: struct kvm_nvhe_init_params PA + */ +SYM_CODE_START(kvm_hyp_cpu_entry) + mov x1, #1 // is_cpu_on = true + b __kvm_hyp_init_cpu +SYM_CODE_END(kvm_hyp_cpu_entry) + +/* + * PSCI CPU_SUSPEND / SYSTEM_SUSPEND entry point + * + * x0: struct kvm_nvhe_init_params PA + */ +SYM_CODE_START(kvm_hyp_cpu_resume) + mov x1, #0 // is_cpu_on = false + b __kvm_hyp_init_cpu +SYM_CODE_END(kvm_hyp_cpu_resume) + +/* + * Common code for CPU entry points. Initializes EL2 state and + * installs the hypervisor before handing over to a C handler. + * + * x0: struct kvm_nvhe_init_params PA + * x1: bool is_cpu_on + */ +SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) + mov x28, x0 // Stash arguments + mov x29, x1 + + /* Check that the core was booted in EL2. */ + mrs x0, CurrentEL + cmp x0, #CurrentEL_EL2 + b.eq 2f + + /* The core booted in EL1. KVM cannot be initialized on it. 
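+ * Park it in the wfe/wfi loop below instead.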
*/ +1: wfe + wfi + b 1b + +2: msr SPsel, #1 // We want to use SP_EL{1,2} + + init_el2_hcr 0 + + bl __kvm_init_el2_state + + /* Enable MMU, set vectors and stack. */ + mov x0, x28 + bl ___kvm_hyp_init // Clobbers x0..x2 + + /* Leave idmap. */ + mov x0, x29 + ldr x1, =kvm_host_psci_cpu_entry + br x1 +SYM_CODE_END(__kvm_hyp_init_cpu) + +SYM_CODE_START(__kvm_handle_stub_hvc) + /* + * __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so + * we need bti j at beginning. + */ + bti j + cmp x0, #HVC_SOFT_RESTART + b.ne 1f + + /* This is where we're about to jump, staying at EL2 */ + msr elr_el2, x1 + mov x0, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT | PSR_MODE_EL2h) + msr spsr_el2, x0 + + /* Shuffle the arguments, and don't come back */ + mov x0, x2 + mov x1, x3 + mov x2, x4 + b reset + +1: cmp x0, #HVC_RESET_VECTORS + b.ne 1f + + /* + * Set the HVC_RESET_VECTORS return code before entering the common + * path so that we do not clobber x0-x2 in case we are coming via + * HVC_SOFT_RESTART. + */ + mov x0, xzr +reset: + /* Reset kvm back to the hyp stub. */ + mov_q x5, INIT_SCTLR_EL2_MMU_OFF + pre_disable_mmu_workaround + msr sctlr_el2, x5 + isb + +alternative_if ARM64_KVM_PROTECTED_MODE + mov_q x5, HCR_HOST_NVHE_FLAGS + msr_hcr_el2 x5 +alternative_else_nop_endif + + /* Install stub vectors */ + adr_l x5, __hyp_stub_vectors + msr vbar_el2, x5 + eret + +1: /* Bad stub call */ + mov_q x0, HVC_STUB_ERR + eret + +SYM_CODE_END(__kvm_handle_stub_hvc) + +/* + * void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp, + * void (*fn)(void)); + * + * SYM_TYPED_FUNC_START() allows C to call this ID-mapped function indirectly + * using a physical pointer without triggering a kCFI failure. + */ +SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd) + /* Turn the MMU off */ + pre_disable_mmu_workaround + mrs x3, sctlr_el2 + bic x4, x3, #SCTLR_ELx_M + msr sctlr_el2, x4 + isb + + tlbi alle2 + + /* Install the new pgtables */ + phys_to_ttbr x5, x0 +alternative_if ARM64_HAS_CNP + orr x5, x5, #TTBR_CNP_BIT +alternative_else_nop_endif + msr ttbr0_el2, x5 + + /* Set the new stack pointer */ + mov sp, x1 + + /* And turn the MMU back on! */ + dsb nsh + isb + set_sctlr_el2 x3 + ret x2 +SYM_FUNC_END(__pkvm_init_switch_pgd) + + .popsection diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c new file mode 100644 index 000000000000..a7c689152f68 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -0,0 +1,708 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google Inc + * Author: Andrew Scull <ascull@google.com> + */ + +#include <hyp/adjust_pc.h> +#include <hyp/switch.h> + +#include <asm/pgtable-types.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_host.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +#include <nvhe/ffa.h> +#include <nvhe/mem_protect.h> +#include <nvhe/mm.h> +#include <nvhe/pkvm.h> +#include <nvhe/trap_handler.h> + +DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); + +void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); + +static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) +{ + __vcpu_assign_sys_reg(vcpu, ZCR_EL1, read_sysreg_el1(SYS_ZCR)); + /* + * On saving/restoring guest sve state, always use the maximum VL for + * the guest. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) guest VL. 
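+ *
+ * ZCR_EL2 is put back to the maximum host VL once the guest state has
+ * been saved.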
+ */ + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); +} + +static void __hyp_sve_restore_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + /* + * On saving/restoring host sve state, always use the maximum VL for + * the host. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) host VL. + * + * Note that this constrains the PE to the maximum shared VL + * that was discovered, if we wish to use larger VLs this will + * need to be revisited. + */ + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); + __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); + write_sysreg_el1(sve_state->zcr_el1, SYS_ZCR); +} + +static void fpsimd_sve_flush(void) +{ + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + +static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) +{ + bool has_fpmr; + + if (!guest_owns_fp_regs()) + return; + + /* + * Traps have been disabled by __deactivate_cptr_traps(), but there + * hasn't necessarily been a context synchronization event yet. + */ + isb(); + + if (vcpu_has_sve(vcpu)) + __hyp_sve_save_guest(vcpu); + else + __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + + has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); + if (has_fpmr) + __vcpu_assign_sys_reg(vcpu, FPMR, read_sysreg_s(SYS_FPMR)); + + if (system_supports_sve()) + __hyp_sve_restore_host(); + else + __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); + + if (has_fpmr) + write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); + + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state; +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state; +} + +static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + fpsimd_sve_flush(); + flush_debug_state(hyp_vcpu); + + hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; + + hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); + hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & + (HCR_TWI | HCR_TWE); + + hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; + + hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; + + hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; +} + +static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + struct vgic_v3_cpu_if *hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + unsigned int 
i; + + fpsimd_sve_sync(&hyp_vcpu->vcpu); + sync_debug_state(hyp_vcpu); + + host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; + + host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; + + host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault; + + host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; + + host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; + host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr; + for (i = 0; i < hyp_cpu_if->used_lrs; ++i) + host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; +} + +static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); + DECLARE_REG(u64, hcr_el2, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); + if (!hyp_vcpu) + return; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + /* Propagate WFx trapping flags */ + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI); + hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI); + } +} + +static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (hyp_vcpu) + pkvm_put_hyp_vcpu(hyp_vcpu); +} + +static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); + int ret; + + if (unlikely(is_protected_kvm_enabled())) { + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + + /* + * KVM (and pKVM) doesn't support SME guests for now, and + * ensures that SME features aren't enabled in pstate when + * loading a vcpu. Therefore, if SME features enabled the host + * is misbehaving. + */ + if (unlikely(system_supports_sme() && read_sysreg_s(SYS_SVCR))) { + ret = -EINVAL; + goto out; + } + + if (!hyp_vcpu) { + ret = -EINVAL; + goto out; + } + + flush_hyp_vcpu(hyp_vcpu); + + ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); + + sync_hyp_vcpu(hyp_vcpu); + } else { + struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu); + + /* The host is fully trusted, run its vCPU directly. 
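+	 * Only a lazy FPSIMD/SVE state switch is wrapped around the run loop.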
*/ + fpsimd_lazy_switch_to_guest(vcpu); + ret = __kvm_vcpu_run(vcpu); + fpsimd_lazy_switch_to_host(vcpu); + } +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, + host_vcpu->arch.pkvm_memcache.nr_pages, + &host_vcpu->arch.pkvm_memcache); +} + +static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) + goto out; + + ret = __pkvm_host_share_guest(pfn, gfn, nr_pages, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_unshare_guest(gfn, nr_pages, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 2); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_relax_perms_guest(gfn, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_wrprotect_guest(gfn, nr_pages, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(bool, mkold, host_ctxt, 4); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_test_clear_young_guest(gfn, nr_pages, mkold, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = 
__pkvm_host_mkyoung_guest(gfn, hyp_vcpu); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + + __kvm_adjust_pc(kern_hyp_va(vcpu)); +} + +static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt) +{ + __kvm_flush_vm_context(); +} + +static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2); + DECLARE_REG(int, level, host_ctxt, 3); + + __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); +} + +static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2); + DECLARE_REG(int, level, host_ctxt, 3); + + __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level); +} + +static void +handle___kvm_tlb_flush_vmid_range(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + DECLARE_REG(phys_addr_t, start, host_ctxt, 2); + DECLARE_REG(unsigned long, pages, host_ctxt, 3); + + __kvm_tlb_flush_vmid_range(kern_hyp_va(mmu), start, pages); +} + +static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + + __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); +} + +static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + struct pkvm_hyp_vm *hyp_vm; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + return; + + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + put_pkvm_hyp_vm(hyp_vm); +} + +static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + + __kvm_flush_cpu_context(kern_hyp_va(mmu)); +} + +static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt) +{ + __kvm_timer_set_cntvoff(cpu_reg(host_ctxt, 1)); +} + +static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt) +{ + u64 tmp; + + tmp = read_sysreg_el2(SYS_SCTLR); + tmp |= SCTLR_ELx_DSSBS; + write_sysreg_el2(tmp, SYS_SCTLR); +} + +static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); +} + +static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) +{ + __vgic_v3_init_lrs(); +} + +static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); + + __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); +} + +static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); + + __vgic_v3_restore_vmcr_aprs(kern_hyp_va(cpu_if)); +} + +static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); + DECLARE_REG(unsigned long, size, host_ctxt, 2); + DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3); + DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4); + DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5); + + /* + * __pkvm_init() will return only if an error occurred, otherwise it + * will tail-call in __pkvm_init_finalise() which will have to deal + * with the host context directly. 
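+ *
+ * The error code written to the return register below is therefore only
+ * ever seen by the host when initialisation fails.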
+ */ + cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base, + hyp_va_bits); +} + +static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(enum arm64_hyp_spectre_vector, slot, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot); +} + +static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn); +} + +static void handle___pkvm_host_unshare_hyp(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_host_unshare_hyp(pfn); +} + +static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); + DECLARE_REG(size_t, size, host_ctxt, 2); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); + + /* + * __pkvm_create_private_mapping() populates a pointer with the + * hypervisor start address of the allocation. + * + * However, handle___pkvm_create_private_mapping() hypercall crosses the + * EL1/EL2 boundary so the pointer would not be valid in this context. + * + * Instead pass the allocation address as the return value (or return + * ERR_PTR() on failure). + */ + unsigned long haddr; + int err = __pkvm_create_private_mapping(phys, size, prot, &haddr); + + if (err) + haddr = (unsigned long)ERR_PTR(err); + + cpu_reg(host_ctxt, 1) = haddr; +} + +static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); +} + +static void handle___pkvm_reserve_vm(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_reserve_vm(); +} + +static void handle___pkvm_unreserve_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + __pkvm_unreserve_vm(handle); +} + +static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); + DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2); + DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3); + + host_kvm = kern_hyp_va(host_kvm); + cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva); +} + +static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 2); + DECLARE_REG(unsigned long, vcpu_hva, host_ctxt, 3); + + host_vcpu = kern_hyp_va(host_vcpu); + cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva); +} + +static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle); +} + +typedef void (*hcall_t)(struct kvm_cpu_context *); + +#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x + +static const hcall_t host_hcall[] = { + /* ___kvm_hyp_init */ + HANDLE_FUNC(__pkvm_init), + HANDLE_FUNC(__pkvm_create_private_mapping), + HANDLE_FUNC(__pkvm_cpu_set_vector), + HANDLE_FUNC(__kvm_enable_ssbs), + HANDLE_FUNC(__vgic_v3_init_lrs), + HANDLE_FUNC(__vgic_v3_get_gic_config), + HANDLE_FUNC(__pkvm_prot_finalize), + + HANDLE_FUNC(__pkvm_host_share_hyp), + HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_share_guest), + HANDLE_FUNC(__pkvm_host_unshare_guest), + HANDLE_FUNC(__pkvm_host_relax_perms_guest), + HANDLE_FUNC(__pkvm_host_wrprotect_guest), + HANDLE_FUNC(__pkvm_host_test_clear_young_guest), + 
HANDLE_FUNC(__pkvm_host_mkyoung_guest), + HANDLE_FUNC(__kvm_adjust_pc), + HANDLE_FUNC(__kvm_vcpu_run), + HANDLE_FUNC(__kvm_flush_vm_context), + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh), + HANDLE_FUNC(__kvm_tlb_flush_vmid), + HANDLE_FUNC(__kvm_tlb_flush_vmid_range), + HANDLE_FUNC(__kvm_flush_cpu_context), + HANDLE_FUNC(__kvm_timer_set_cntvoff), + HANDLE_FUNC(__vgic_v3_save_aprs), + HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), + HANDLE_FUNC(__pkvm_reserve_vm), + HANDLE_FUNC(__pkvm_unreserve_vm), + HANDLE_FUNC(__pkvm_init_vm), + HANDLE_FUNC(__pkvm_init_vcpu), + HANDLE_FUNC(__pkvm_teardown_vm), + HANDLE_FUNC(__pkvm_vcpu_load), + HANDLE_FUNC(__pkvm_vcpu_put), + HANDLE_FUNC(__pkvm_tlb_flush_vmid), +}; + +static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, id, host_ctxt, 0); + unsigned long hcall_min = 0; + hcall_t hfn; + + /* + * If pKVM has been initialised then reject any calls to the + * early "privileged" hypercalls. Note that we cannot reject + * calls to __pkvm_prot_finalize for two reasons: (1) The static + * key used to determine initialisation must be toggled prior to + * finalisation and (2) finalisation is performed on a per-CPU + * basis. This is all fine, however, since __pkvm_prot_finalize + * returns -EPERM after the first call for a given CPU. + */ + if (static_branch_unlikely(&kvm_protected_mode_initialized)) + hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize; + + id &= ~ARM_SMCCC_CALL_HINTS; + id -= KVM_HOST_SMCCC_ID(0); + + if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall))) + goto inval; + + hfn = host_hcall[id]; + if (unlikely(!hfn)) + goto inval; + + cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS; + hfn(host_ctxt); + + return; +inval: + cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; +} + +static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt) +{ + __kvm_hyp_host_forward_smc(host_ctxt); +} + +static void handle_host_smc(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, func_id, host_ctxt, 0); + bool handled; + + func_id &= ~ARM_SMCCC_CALL_HINTS; + + handled = kvm_host_psci_handler(host_ctxt, func_id); + if (!handled) + handled = kvm_host_ffa_handler(host_ctxt, func_id); + if (!handled) + default_host_smc_handler(host_ctxt); + + /* SMC was trapped, move ELR past the current PC. */ + kvm_skip_host_instr(); +} + +void handle_trap(struct kvm_cpu_context *host_ctxt) +{ + u64 esr = read_sysreg_el2(SYS_ESR); + + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_HVC64: + handle_host_hcall(host_ctxt); + break; + case ESR_ELx_EC_SMC64: + handle_host_smc(host_ctxt); + break; + case ESR_ELx_EC_IABT_LOW: + case ESR_ELx_EC_DABT_LOW: + handle_host_mem_abort(host_ctxt); + break; + default: + BUG(); + } +} diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c new file mode 100644 index 000000000000..04d194583f1e --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google LLC + * Author: David Brazdil <dbrazdil@google.com> + */ + +#include <asm/kvm_asm.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +/* + * nVHE copy of data structures tracking available CPU cores. + * Only entries for CPUs that were online at KVM init are populated. + * Other CPUs should not be allowed to boot because their features were + * not checked against the finalized system capabilities. + */ +u64 __ro_after_init hyp_cpu_logical_map[NR_CPUS] = { [0 ... 
NR_CPUS-1] = INVALID_HWID }; + +u64 cpu_logical_map(unsigned int cpu) +{ + BUG_ON(cpu >= ARRAY_SIZE(hyp_cpu_logical_map)); + + return hyp_cpu_logical_map[cpu]; +} + +unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS]; + +unsigned long __hyp_per_cpu_offset(unsigned int cpu) +{ + unsigned long *cpu_base_array; + unsigned long this_cpu_base; + unsigned long elf_base; + + BUG_ON(cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base)); + + cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base; + this_cpu_base = kern_hyp_va(cpu_base_array[cpu]); + elf_base = (unsigned long)&__per_cpu_start; + return this_cpu_base - elf_base; +} diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S new file mode 100644 index 000000000000..d724f6d69302 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + * Written by David Brazdil <dbrazdil@google.com> + * + * Linker script used for partial linking of nVHE EL2 object files. + */ + +#include <asm/hyp_image.h> +#include <asm-generic/vmlinux.lds.h> +#include <asm/cache.h> +#include <asm/memory.h> + +SECTIONS { + HYP_SECTION(.idmap.text) + HYP_SECTION(.text) + HYP_SECTION(.data..ro_after_init) + HYP_SECTION(.rodata) + + /* + * .hyp..data..percpu needs to be page aligned to maintain the same + * alignment for when linking into vmlinux. + */ + . = ALIGN(PAGE_SIZE); + BEGIN_HYP_SECTION(.data..percpu) + PERCPU_INPUT(L1_CACHE_BYTES) + END_HYP_SECTION + + HYP_SECTION(.bss) + HYP_SECTION(.data) +} diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c new file mode 100644 index 000000000000..baa6260f88dc --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 - Google LLC + * Author: Keir Fraser <keirf@google.com> + */ + +#include <linux/list.h> +#include <linux/bug.h> + +static inline __must_check bool nvhe_check_data_corruption(bool v) +{ + return v; +} + +#define NVHE_CHECK_DATA_CORRUPTION(condition) \ + nvhe_check_data_corruption(({ \ + bool corruption = unlikely(condition); \ + if (corruption) { \ + if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ + BUG(); \ + } else \ + WARN_ON(1); \ + } \ + corruption; \ + })) + +/* The predicates checked here are taken from lib/list_debug.c. 
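+ * A separate copy is carried here because the nVHE EL2 object cannot call
+ * into the kernel's implementation at run time.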
*/ + +__list_valid_slowpath +bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev, + struct list_head *next) +{ + if (NVHE_CHECK_DATA_CORRUPTION(next->prev != prev) || + NVHE_CHECK_DATA_CORRUPTION(prev->next != next) || + NVHE_CHECK_DATA_CORRUPTION(new == prev || new == next)) + return false; + + return true; +} + +__list_valid_slowpath +bool __list_del_entry_valid_or_report(struct list_head *entry) +{ + struct list_head *prev, *next; + + prev = entry->prev; + next = entry->next; + + if (NVHE_CHECK_DATA_CORRUPTION(next == LIST_POISON1) || + NVHE_CHECK_DATA_CORRUPTION(prev == LIST_POISON2) || + NVHE_CHECK_DATA_CORRUPTION(prev->next != entry) || + NVHE_CHECK_DATA_CORRUPTION(next->prev != entry)) + return false; + + return true; +} diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c new file mode 100644 index 000000000000..49db32f3ddf7 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -0,0 +1,1396 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> +#include <asm/stage2_pgtable.h> + +#include <hyp/fault.h> + +#include <nvhe/gfp.h> +#include <nvhe/memory.h> +#include <nvhe/mem_protect.h> +#include <nvhe/mm.h> + +#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) + +struct host_mmu host_mmu; + +static struct hyp_pool host_s2_pool; + +static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); +#define current_vm (*this_cpu_ptr(&__current_vm)) + +static void guest_lock_component(struct pkvm_hyp_vm *vm) +{ + hyp_spin_lock(&vm->lock); + current_vm = vm; +} + +static void guest_unlock_component(struct pkvm_hyp_vm *vm) +{ + current_vm = NULL; + hyp_spin_unlock(&vm->lock); +} + +static void host_lock_component(void) +{ + hyp_spin_lock(&host_mmu.lock); +} + +static void host_unlock_component(void) +{ + hyp_spin_unlock(&host_mmu.lock); +} + +static void hyp_lock_component(void) +{ + hyp_spin_lock(&pkvm_pgd_lock); +} + +static void hyp_unlock_component(void) +{ + hyp_spin_unlock(&pkvm_pgd_lock); +} + +#define for_each_hyp_page(__p, __st, __sz) \ + for (struct hyp_page *__p = hyp_phys_to_page(__st), \ + *__e = __p + ((__sz) >> PAGE_SHIFT); \ + __p < __e; __p++) + +static void *host_s2_zalloc_pages_exact(size_t size) +{ + void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size)); + + hyp_split_page(hyp_virt_to_page(addr)); + + /* + * The size of concatenated PGDs is always a power of two of PAGE_SIZE, + * so there should be no need to free any of the tail pages to make the + * allocation exact. 
+ */ + WARN_ON(size != (PAGE_SIZE << get_order(size))); + + return addr; +} + +static void *host_s2_zalloc_page(void *pool) +{ + return hyp_alloc_pages(pool, 0); +} + +static void host_s2_get_page(void *addr) +{ + hyp_get_page(&host_s2_pool, addr); +} + +static void host_s2_put_page(void *addr) +{ + hyp_put_page(&host_s2_pool, addr); +} + +static void host_s2_free_unlinked_table(void *addr, s8 level) +{ + kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level); +} + +static int prepare_s2_pool(void *pgt_pool_base) +{ + unsigned long nr_pages, pfn; + int ret; + + pfn = hyp_virt_to_pfn(pgt_pool_base); + nr_pages = host_s2_pgtable_pages(); + ret = hyp_pool_init(&host_s2_pool, pfn, nr_pages, 0); + if (ret) + return ret; + + host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { + .zalloc_pages_exact = host_s2_zalloc_pages_exact, + .zalloc_page = host_s2_zalloc_page, + .free_unlinked_table = host_s2_free_unlinked_table, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, + .page_count = hyp_page_count, + .get_page = host_s2_get_page, + .put_page = host_s2_put_page, + }; + + return 0; +} + +static void prepare_host_vtcr(void) +{ + u32 parange, phys_shift; + + /* The host stage 2 is id-mapped, so use parange for T0SZ */ + parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val); + phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); + + host_mmu.arch.mmu.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, + id_aa64mmfr1_el1_sys_val, phys_shift); +} + +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot); + +int kvm_host_prepare_stage2(void *pgt_pool_base) +{ + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; + int ret; + + prepare_host_vtcr(); + hyp_spin_lock_init(&host_mmu.lock); + mmu->arch = &host_mmu.arch; + + ret = prepare_s2_pool(pgt_pool_base); + if (ret) + return ret; + + ret = __kvm_pgtable_stage2_init(&host_mmu.pgt, mmu, + &host_mmu.mm_ops, KVM_HOST_S2_FLAGS, + host_stage2_force_pte_cb); + if (ret) + return ret; + + mmu->pgd_phys = __hyp_pa(host_mmu.pgt.pgd); + mmu->pgt = &host_mmu.pgt; + atomic64_set(&mmu->vmid.id, 0); + + return 0; +} + +static void *guest_s2_zalloc_pages_exact(size_t size) +{ + void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size)); + + WARN_ON(size != (PAGE_SIZE << get_order(size))); + hyp_split_page(hyp_virt_to_page(addr)); + + return addr; +} + +static void guest_s2_free_pages_exact(void *addr, unsigned long size) +{ + u8 order = get_order(size); + unsigned int i; + + for (i = 0; i < (1 << order); i++) + hyp_put_page(¤t_vm->pool, addr + (i * PAGE_SIZE)); +} + +static void *guest_s2_zalloc_page(void *mc) +{ + struct hyp_page *p; + void *addr; + + addr = hyp_alloc_pages(¤t_vm->pool, 0); + if (addr) + return addr; + + addr = pop_hyp_memcache(mc, hyp_phys_to_virt); + if (!addr) + return addr; + + memset(addr, 0, PAGE_SIZE); + p = hyp_virt_to_page(addr); + p->refcount = 1; + p->order = 0; + + return addr; +} + +static void guest_s2_get_page(void *addr) +{ + hyp_get_page(¤t_vm->pool, addr); +} + +static void guest_s2_put_page(void *addr) +{ + hyp_put_page(¤t_vm->pool, addr); +} + +static void __apply_guest_page(void *va, size_t size, + void (*func)(void *addr, size_t size)) +{ + size += va - PTR_ALIGN_DOWN(va, PAGE_SIZE); + va = PTR_ALIGN_DOWN(va, PAGE_SIZE); + size = PAGE_ALIGN(size); + + while (size) { + size_t map_size = PAGE_SIZE; + void *map; + + if (IS_ALIGNED((unsigned long)va, PMD_SIZE) && size >= PMD_SIZE) + map = hyp_fixblock_map(__hyp_pa(va), &map_size); + else + map = hyp_fixmap_map(__hyp_pa(va)); + + func(map, 
map_size); + + if (map_size == PMD_SIZE) + hyp_fixblock_unmap(); + else + hyp_fixmap_unmap(); + + size -= map_size; + va += map_size; + } +} + +static void clean_dcache_guest_page(void *va, size_t size) +{ + __apply_guest_page(va, size, __clean_dcache_guest_page); +} + +static void invalidate_icache_guest_page(void *va, size_t size) +{ + __apply_guest_page(va, size, __invalidate_icache_guest_page); +} + +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) +{ + struct kvm_s2_mmu *mmu = &vm->kvm.arch.mmu; + unsigned long nr_pages; + int ret; + + nr_pages = kvm_pgtable_stage2_pgd_size(mmu->vtcr) >> PAGE_SHIFT; + ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0); + if (ret) + return ret; + + hyp_spin_lock_init(&vm->lock); + vm->mm_ops = (struct kvm_pgtable_mm_ops) { + .zalloc_pages_exact = guest_s2_zalloc_pages_exact, + .free_pages_exact = guest_s2_free_pages_exact, + .zalloc_page = guest_s2_zalloc_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, + .page_count = hyp_page_count, + .get_page = guest_s2_get_page, + .put_page = guest_s2_put_page, + .dcache_clean_inval_poc = clean_dcache_guest_page, + .icache_inval_pou = invalidate_icache_guest_page, + }; + + guest_lock_component(vm); + ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, NULL); + guest_unlock_component(vm); + if (ret) + return ret; + + vm->kvm.arch.mmu.pgd_phys = __hyp_pa(vm->pgt.pgd); + + return 0; +} + +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) +{ + struct hyp_page *page; + void *addr; + + /* Dump all pgtable pages in the hyp_pool */ + guest_lock_component(vm); + kvm_pgtable_stage2_destroy(&vm->pgt); + vm->kvm.arch.mmu.pgd_phys = 0ULL; + guest_unlock_component(vm); + + /* Drain the hyp_pool into the memcache */ + addr = hyp_alloc_pages(&vm->pool, 0); + while (addr) { + page = hyp_virt_to_page(addr); + page->refcount = 0; + page->order = 0; + push_hyp_memcache(mc, addr, hyp_virt_to_phys); + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); + addr = hyp_alloc_pages(&vm->pool, 0); + } +} + +int __pkvm_prot_finalize(void) +{ + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + + if (params->hcr_el2 & HCR_VM) + return -EPERM; + + params->vttbr = kvm_get_vttbr(mmu); + params->vtcr = mmu->vtcr; + params->hcr_el2 |= HCR_VM; + + /* + * The CMO below not only cleans the updated params to the + * PoC, but also provides the DSB that ensures ongoing + * page-table walks that have started before we trapped to EL2 + * have completed. + */ + kvm_flush_dcache_to_poc(params, sizeof(*params)); + + write_sysreg_hcr(params->hcr_el2); + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); + + /* + * Make sure to have an ISB before the TLB maintenance below but only + * when __load_stage2() doesn't include one already. 
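+	 * (__load_stage2() only emits an ISB when ARM64_WORKAROUND_SPECULATIVE_AT
+	 * is active, which is why the ALTERNATIVE below patches this ISB into a
+	 * NOP for exactly that case.)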
+ */ + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); + + /* Invalidate stale HCR bits that may be cached in TLBs */ + __tlbi(vmalls12e1); + dsb(nsh); + isb(); + + return 0; +} + +static int host_stage2_unmap_dev_all(void) +{ + struct kvm_pgtable *pgt = &host_mmu.pgt; + struct memblock_region *reg; + u64 addr = 0; + int i, ret; + + /* Unmap all non-memory regions to recycle the pages */ + for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) { + reg = &hyp_memory[i]; + ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr); + if (ret) + return ret; + } + return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); +} + +/* + * Ensure the PFN range is contained within PA-range. + * + * This check is also robust to overflows and is therefore a requirement before + * using a pfn/nr_pages pair from an untrusted source. + */ +static bool pfn_range_is_valid(u64 pfn, u64 nr_pages) +{ + u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT); + + return pfn < limit && ((limit - pfn) >= nr_pages); +} + +struct kvm_mem_range { + u64 start; + u64 end; +}; + +static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) +{ + int cur, left = 0, right = hyp_memblock_nr; + struct memblock_region *reg; + phys_addr_t end; + + range->start = 0; + range->end = ULONG_MAX; + + /* The list of memblock regions is sorted, binary search it */ + while (left < right) { + cur = (left + right) >> 1; + reg = &hyp_memory[cur]; + end = reg->base + reg->size; + if (addr < reg->base) { + right = cur; + range->end = reg->base; + } else if (addr >= end) { + left = cur + 1; + range->start = end; + } else { + range->start = reg->base; + range->end = end; + return reg; + } + } + + return NULL; +} + +bool addr_is_memory(phys_addr_t phys) +{ + struct kvm_mem_range range; + + return !!find_mem_range(phys, &range); +} + +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + +static int check_range_allowed_memory(u64 start, u64 end) +{ + struct memblock_region *reg; + struct kvm_mem_range range; + + /* + * Callers can't check the state of a range that overlaps memory and + * MMIO regions, so ensure [start, end[ is in the same kvm_mem_range. + */ + reg = find_mem_range(start, &range); + if (!is_in_mem_range(end - 1, &range)) + return -EINVAL; + + if (!reg || reg->flags & MEMBLOCK_NOMAP) + return -EPERM; + + return 0; +} + +static bool range_is_memory(u64 start, u64 end) +{ + struct kvm_mem_range r; + + if (!find_mem_range(start, &r)) + return false; + + return is_in_mem_range(end - 1, &r); +} + +static inline int __host_stage2_idmap(u64 start, u64 end, + enum kvm_pgtable_prot prot) +{ + return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start, + prot, &host_s2_pool, 0); +} + +/* + * The pool has been provided with enough pages to cover all of memory with + * page granularity, but it is difficult to know how much of the MMIO range + * we will need to cover upfront, so we may need to 'recycle' the pages if we + * run out. + */ +#define host_stage2_try(fn, ...) 
\ + ({ \ + int __ret; \ + hyp_assert_lock_held(&host_mmu.lock); \ + __ret = fn(__VA_ARGS__); \ + if (__ret == -ENOMEM) { \ + __ret = host_stage2_unmap_dev_all(); \ + if (!__ret) \ + __ret = fn(__VA_ARGS__); \ + } \ + __ret; \ + }) + +static inline bool range_included(struct kvm_mem_range *child, + struct kvm_mem_range *parent) +{ + return parent->start <= child->start && child->end <= parent->end; +} + +static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) +{ + struct kvm_mem_range cur; + kvm_pte_t pte; + u64 granule; + s8 level; + int ret; + + hyp_assert_lock_held(&host_mmu.lock); + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level); + if (ret) + return ret; + + if (kvm_pte_valid(pte)) + return -EAGAIN; + + if (pte) { + WARN_ON(addr_is_memory(addr) && + get_host_state(hyp_phys_to_page(addr)) != PKVM_NOPAGE); + return -EPERM; + } + + for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) { + if (!kvm_level_supports_block_mapping(level)) + continue; + granule = kvm_granule_size(level); + cur.start = ALIGN_DOWN(addr, granule); + cur.end = cur.start + granule; + if (!range_included(&cur, range)) + continue; + *range = cur; + return 0; + } + + WARN_ON(1); + + return -EINVAL; +} + +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, + enum kvm_pgtable_prot prot) +{ + return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); +} + +static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state) +{ + for_each_hyp_page(page, addr, size) + set_host_state(page, state); +} + +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +{ + int ret; + + if (!range_is_memory(addr, addr + size)) + return -EPERM; + + ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, + addr, size, &host_s2_pool, owner_id); + if (ret) + return ret; + + /* Don't forget to update the vmemmap tracking for the host */ + if (owner_id == PKVM_ID_HOST) + __host_update_page_state(addr, size, PKVM_PAGE_OWNED); + else + __host_update_page_state(addr, size, PKVM_NOPAGE); + + return 0; +} + +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) +{ + /* + * Block mappings must be used with care in the host stage-2 as a + * kvm_pgtable_stage2_map() operation targeting a page in the range of + * an existing block will delete the block under the assumption that + * mappings in the rest of the block range can always be rebuilt lazily. + * That assumption is correct for the host stage-2 with RWX mappings + * targeting memory or RW mappings targeting MMIO ranges (see + * host_stage2_idmap() below which implements some of the host memory + * abort logic). However, this is not safe for any other mappings where + * the host stage-2 page-table is in fact the only place where this + * state is stored. In all those cases, it is safer to use page-level + * mappings, hence avoiding to lose the state because of side-effects in + * kvm_pgtable_stage2_map(). + */ + if (range_is_memory(addr, end)) + return prot != PKVM_HOST_MEM_PROT; + else + return prot != PKVM_HOST_MMIO_PROT; +} + +static int host_stage2_idmap(u64 addr) +{ + struct kvm_mem_range range; + bool is_memory = !!find_mem_range(addr, &range); + enum kvm_pgtable_prot prot; + int ret; + + prot = is_memory ? 
PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; + + host_lock_component(); + ret = host_stage2_adjust_range(addr, &range); + if (ret) + goto unlock; + + ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot); +unlock: + host_unlock_component(); + + return ret; +} + +void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) +{ + struct kvm_vcpu_fault_info fault; + u64 esr, addr; + int ret = 0; + + esr = read_sysreg_el2(SYS_ESR); + if (!__get_fault_info(esr, &fault)) { + /* + * We've presumably raced with a page-table change which caused + * AT to fail, try again. + */ + return; + } + + + /* + * Yikes, we couldn't resolve the fault IPA. This should reinject an + * abort into the host when we figure out how to do that. + */ + BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS)); + addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12; + + ret = host_stage2_idmap(addr); + BUG_ON(ret && ret != -EAGAIN); +} + +struct check_walk_data { + enum pkvm_page_state desired; + enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr); +}; + +static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct check_walk_data *d = ctx->arg; + + return d->get_page_state(ctx->old, ctx->addr) == d->desired ? 0 : -EPERM; +} + +static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct check_walk_data *data) +{ + struct kvm_pgtable_walker walker = { + .cb = __check_page_state_visitor, + .arg = data, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + +static int __host_check_page_state_range(u64 addr, u64 size, + enum pkvm_page_state state) +{ + int ret; + + ret = check_range_allowed_memory(addr, addr + size); + if (ret) + return ret; + + hyp_assert_lock_held(&host_mmu.lock); + + for_each_hyp_page(page, addr, size) { + if (get_host_state(page) != state) + return -EPERM; + } + + return 0; +} + +static int __host_set_page_state_range(u64 addr, u64 size, + enum pkvm_page_state state) +{ + if (get_host_state(hyp_phys_to_page(addr)) == PKVM_NOPAGE) { + int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); + + if (ret) + return ret; + } + + __host_update_page_state(addr, size, state); + + return 0; +} + +static void __hyp_set_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) +{ + for_each_hyp_page(page, phys, size) + set_hyp_state(page, state); +} + +static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) +{ + for_each_hyp_page(page, phys, size) { + if (get_hyp_state(page) != state) + return -EPERM; + } + + return 0; +} + +static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) +{ + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; + + return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); +} + +static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr, + u64 size, enum pkvm_page_state state) +{ + struct check_walk_data d = { + .desired = state, + .get_page_state = guest_get_page_state, + }; + + hyp_assert_lock_held(&vm->lock); + return check_page_state_range(&vm->pgt, addr, size, &d); +} + +int __pkvm_host_share_hyp(u64 pfn) +{ + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE; + int ret; + + host_lock_component(); + hyp_lock_component(); + + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; + + 
__hyp_set_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_unshare_hyp(u64 pfn) +{ + u64 phys = hyp_pfn_to_phys(pfn); + u64 virt = (u64)__hyp_va(phys); + u64 size = PAGE_SIZE; + int ret; + + host_lock_component(); + hyp_lock_component(); + + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + if (hyp_page_count((void *)virt)) { + ret = -EBUSY; + goto unlock; + } + + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) +{ + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + void *virt = __hyp_va(phys); + int ret; + + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + + host_lock_component(); + hyp_lock_component(); + + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; + + __hyp_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, PAGE_HYP)); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) +{ + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + u64 virt = (u64)__hyp_va(phys); + int ret; + + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + + host_lock_component(); + hyp_lock_component(); + + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; + + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int hyp_pin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + u64 phys = __hyp_pa(start); + u64 size = end - start; + struct hyp_page *p; + int ret; + + host_lock_component(); + hyp_lock_component(); + + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + hyp_page_ref_inc(p); + if (p->refcount == 1) + WARN_ON(pkvm_create_mappings_locked((void *)cur, + (void *)cur + PAGE_SIZE, + PAGE_HYP)); + } + +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +void hyp_unpin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + struct hyp_page *p; + + host_lock_component(); + hyp_lock_component(); + + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + if (p->refcount == 1) + 
WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, cur, PAGE_SIZE) != PAGE_SIZE); + hyp_page_ref_dec(p); + } + + hyp_unlock_component(); + host_unlock_component(); +} + +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) +{ + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + int ret; + + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) +{ + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + int ret; + + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + host_unlock_component(); + + return ret; +} + +static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *size) +{ + size_t block_size; + + if (nr_pages == 1) { + *size = PAGE_SIZE; + return 0; + } + + /* We solely support second to last level huge mapping */ + block_size = kvm_granule_size(KVM_PGTABLE_LAST_LEVEL - 1); + + if (nr_pages != block_size >> PAGE_SHIFT) + return -EINVAL; + + if (!IS_ALIGNED(phys | ipa, block_size)) + return -EINVAL; + + *size = block_size; + return 0; +} + +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys = hyp_pfn_to_phys(pfn); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 size; + int ret; + + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + + ret = __guest_check_transition_size(phys, ipa, nr_pages, &size); + if (ret) + return ret; + + ret = check_range_allowed_memory(phys, phys + size); + if (ret) + return ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __guest_check_page_state_range(vm, ipa, size, PKVM_NOPAGE); + if (ret) + goto unlock; + + for_each_hyp_page(page, phys, size) { + switch (get_host_state(page)) { + case PKVM_PAGE_OWNED: + continue; + case PKVM_PAGE_SHARED_OWNED: + if (page->host_share_guest_count == U32_MAX) { + ret = -EBUSY; + goto unlock; + } + + /* Only host to np-guest multi-sharing is tolerated */ + if (page->host_share_guest_count) + continue; + + fallthrough; + default: + ret = -EPERM; + goto unlock; + } + } + + for_each_hyp_page(page, phys, size) { + set_host_state(page, PKVM_PAGE_SHARED_OWNED); + page->host_share_guest_count++; + } + + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, size, phys, + pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa, u64 size) +{ + enum pkvm_page_state state; + kvm_pte_t pte; + u64 phys; + s8 level; + int ret; + + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + return ret; + if (!kvm_pte_valid(pte)) + return -ENOENT; + if (size && kvm_granule_size(level) != size) + return -E2BIG; + + if (!size) + size = kvm_granule_size(level); + + state = guest_get_page_state(pte, ipa); + if (state != PKVM_PAGE_SHARED_BORROWED) + return -EPERM; + + phys = kvm_pte_to_phys(pte); + ret = 
check_range_allowed_memory(phys, phys + size); + if (WARN_ON(ret)) + return ret; + + for_each_hyp_page(page, phys, size) { + if (get_host_state(page) != PKVM_PAGE_SHARED_OWNED) + return -EPERM; + if (WARN_ON(!page->host_share_guest_count)) + return -EINVAL; + } + + *__phys = phys; + + return 0; +} + +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) +{ + u64 ipa = hyp_pfn_to_phys(gfn); + u64 size, phys; + int ret; + + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __check_host_shared_guest(vm, &phys, ipa, size); + if (ret) + goto unlock; + + ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, size); + if (ret) + goto unlock; + + for_each_hyp_page(page, phys, size) { + /* __check_host_shared_guest() protects against underflow */ + page->host_share_guest_count--; + if (!page->host_share_guest_count) + set_host_state(page, PKVM_PAGE_OWNED); + } + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa, u64 size) +{ + u64 phys; + int ret; + + if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) + return; + + host_lock_component(); + guest_lock_component(vm); + + ret = __check_host_shared_guest(vm, &phys, ipa, size); + + guest_unlock_component(vm); + host_unlock_component(); + + WARN_ON(ret && ret != -ENOENT); +} + +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); + int ret; + + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + + assert_host_shared_guest(vm, ipa, 0); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); + guest_unlock_component(vm); + + return ret; +} + +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) +{ + u64 size, ipa = hyp_pfn_to_phys(gfn); + int ret; + + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; + + assert_host_shared_guest(vm, ipa, size); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, size); + guest_unlock_component(vm); + + return ret; +} + +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm) +{ + u64 size, ipa = hyp_pfn_to_phys(gfn); + int ret; + + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; + + assert_host_shared_guest(vm, ipa, size); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, size, mkold); + guest_unlock_component(vm); + + return ret; +} + +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); + + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + assert_host_shared_guest(vm, ipa, 0); + guest_lock_component(vm); + kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); + guest_unlock_component(vm); + + return 0; +} + +#ifdef CONFIG_NVHE_EL2_DEBUG +struct pkvm_expected_state { + enum pkvm_page_state host; + enum pkvm_page_state hyp; + enum pkvm_page_state guest[2]; /* [ gfn, gfn + 1 ] */ +}; + +static struct pkvm_expected_state selftest_state; +static struct hyp_page 
*selftest_page; + +static struct pkvm_hyp_vm selftest_vm = { + .kvm = { + .arch = { + .mmu = { + .arch = &selftest_vm.kvm.arch, + .pgt = &selftest_vm.pgt, + }, + }, + }, +}; + +static struct pkvm_hyp_vcpu selftest_vcpu = { + .vcpu = { + .arch = { + .hw_mmu = &selftest_vm.kvm.arch.mmu, + }, + .kvm = &selftest_vm.kvm, + }, +}; + +static void init_selftest_vm(void *virt) +{ + struct hyp_page *p = hyp_virt_to_page(virt); + int i; + + selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + + for (i = 0; i < pkvm_selftest_pages(); i++) { + if (p[i].refcount) + continue; + p[i].refcount = 1; + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } +} + +static u64 selftest_ipa(void) +{ + return BIT(selftest_vm.pgt.ia_bits - 1); +} + +static void assert_page_state(void) +{ + void *virt = hyp_page_to_virt(selftest_page); + u64 size = PAGE_SIZE << selftest_page->order; + struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; + u64 phys = hyp_virt_to_phys(virt); + u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE }; + struct pkvm_hyp_vm *vm; + + vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + + host_lock_component(); + WARN_ON(__host_check_page_state_range(phys, size, selftest_state.host)); + host_unlock_component(); + + hyp_lock_component(); + WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp)); + hyp_unlock_component(); + + guest_lock_component(&selftest_vm); + WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0])); + WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1])); + guest_unlock_component(&selftest_vm); +} + +#define assert_transition_res(res, fn, ...) \ + do { \ + WARN_ON(fn(__VA_ARGS__) != res); \ + assert_page_state(); \ + } while (0) + +void pkvm_ownership_selftest(void *base) +{ + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX; + void *virt = hyp_alloc_pages(&host_s2_pool, 0); + struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; + struct pkvm_hyp_vm *vm = &selftest_vm; + u64 phys, size, pfn, gfn; + + WARN_ON(!virt); + selftest_page = hyp_virt_to_page(virt); + selftest_page->refcount = 0; + init_selftest_vm(base); + + size = PAGE_SIZE << selftest_page->order; + phys = hyp_virt_to_phys(virt); + pfn = hyp_phys_to_pfn(phys); + gfn = hyp_phys_to_pfn(selftest_ipa()); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = PKVM_PAGE_OWNED; + selftest_state.guest[0] = selftest_state.guest[1] = PKVM_NOPAGE; + assert_page_state(); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = 
PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + hyp_unpin_shared_mem(virt, virt + size); + WARN_ON(hyp_page_count(virt) != 1); + assert_transition_res(-EBUSY, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + hyp_unpin_shared_mem(virt, virt + size); + assert_page_state(); + WARN_ON(hyp_page_count(virt)); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_hyp, pfn); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.guest[0] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + WARN_ON(hyp_virt_to_page(virt)->host_share_guest_count != 2); + + selftest_state.guest[0] = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn, 1, vm); + + selftest_state.guest[1] = PKVM_NOPAGE; + selftest_state.host = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = 
PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1); + + selftest_page->refcount = 1; + hyp_put_page(&host_s2_pool, virt); +} +#endif diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c new file mode 100644 index 000000000000..ae8391baebc3 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -0,0 +1,504 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> +#include <asm/spectre.h> + +#include <nvhe/early_alloc.h> +#include <nvhe/gfp.h> +#include <nvhe/memory.h> +#include <nvhe/mem_protect.h> +#include <nvhe/mm.h> +#include <nvhe/spinlock.h> + +struct kvm_pgtable pkvm_pgtable; +hyp_spinlock_t pkvm_pgd_lock; + +struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; +unsigned int hyp_memblock_nr; + +static u64 __io_map_base; + +struct hyp_fixmap_slot { + u64 addr; + kvm_pte_t *ptep; +}; +static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots); + +static int __pkvm_create_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) +{ + int err; + + hyp_spin_lock(&pkvm_pgd_lock); + err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot); + hyp_spin_unlock(&pkvm_pgd_lock); + + return err; +} + +static int __pkvm_alloc_private_va_range(unsigned long start, size_t size) +{ + unsigned long cur; + + hyp_assert_lock_held(&pkvm_pgd_lock); + + if (!start || start < __io_map_base) + return -EINVAL; + + /* The allocated size is always a multiple of PAGE_SIZE */ + cur = start + PAGE_ALIGN(size); + + /* Are we overflowing on the vmemmap ? */ + if (cur > __hyp_vmemmap) + return -ENOMEM; + + __io_map_base = cur; + + return 0; +} + +/** + * pkvm_alloc_private_va_range - Allocates a private VA range. + * @size: The size of the VA range to reserve. + * @haddr: The hypervisor virtual start address of the allocation. + * + * The private virtual address (VA) range is allocated above __io_map_base + * and aligned based on the order of @size. + * + * Return: 0 on success or negative error code on failure. 
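+ * Note that *@haddr is written back even on failure, in which case it must
+ * be ignored by the caller.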
+ */ +int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr) +{ + unsigned long addr; + int ret; + + hyp_spin_lock(&pkvm_pgd_lock); + addr = __io_map_base; + ret = __pkvm_alloc_private_va_range(addr, size); + hyp_spin_unlock(&pkvm_pgd_lock); + + *haddr = addr; + + return ret; +} + +int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot, + unsigned long *haddr) +{ + unsigned long addr; + int err; + + size = PAGE_ALIGN(size + offset_in_page(phys)); + err = pkvm_alloc_private_va_range(size, &addr); + if (err) + return err; + + err = __pkvm_create_mappings(addr, size, phys, prot); + if (err) + return err; + + *haddr = addr + offset_in_page(phys); + return err; +} + +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot) +{ + unsigned long start = (unsigned long)from; + unsigned long end = (unsigned long)to; + unsigned long virt_addr; + phys_addr_t phys; + + hyp_assert_lock_held(&pkvm_pgd_lock); + + start = start & PAGE_MASK; + end = PAGE_ALIGN(end); + + for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { + int err; + + phys = hyp_virt_to_phys((void *)virt_addr); + err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE, + phys, prot); + if (err) + return err; + } + + return 0; +} + +int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +{ + int ret; + + hyp_spin_lock(&pkvm_pgd_lock); + ret = pkvm_create_mappings_locked(from, to, prot); + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +} + +int hyp_back_vmemmap(phys_addr_t back) +{ + unsigned long i, start, size, end = 0; + int ret; + + for (i = 0; i < hyp_memblock_nr; i++) { + start = hyp_memory[i].base; + start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE); + /* + * The beginning of the hyp_vmemmap region for the current + * memblock may already be backed by the page backing the end + * the previous region, so avoid mapping it twice. 
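+		 * (start is clamped to the previous iteration's end below, so a
+		 * vmemmap page shared by two neighbouring memblocks is only
+		 * mapped once.)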
+ */ + start = max(start, end); + + end = hyp_memory[i].base + hyp_memory[i].size; + end = PAGE_ALIGN((u64)hyp_phys_to_page(end)); + if (start >= end) + continue; + + size = end - start; + ret = __pkvm_create_mappings(start, size, back, PAGE_HYP); + if (ret) + return ret; + + memset(hyp_phys_to_virt(back), 0, size); + back += size; + } + + return 0; +} + +static void *__hyp_bp_vect_base; +int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot) +{ + void *vector; + + switch (slot) { + case HYP_VECTOR_DIRECT: { + vector = __kvm_hyp_vector; + break; + } + case HYP_VECTOR_SPECTRE_DIRECT: { + vector = __bp_harden_hyp_vecs; + break; + } + case HYP_VECTOR_INDIRECT: + case HYP_VECTOR_SPECTRE_INDIRECT: { + vector = (void *)__hyp_bp_vect_base; + break; + } + default: + return -EINVAL; + } + + vector = __kvm_vector_slot2addr(vector, slot); + *this_cpu_ptr(&kvm_hyp_vector) = (unsigned long)vector; + + return 0; +} + +int hyp_map_vectors(void) +{ + phys_addr_t phys; + unsigned long bp_base; + int ret; + + if (!kvm_system_needs_idmapped_vectors()) { + __hyp_bp_vect_base = __bp_harden_hyp_vecs; + return 0; + } + + phys = __hyp_pa(__bp_harden_hyp_vecs); + ret = __pkvm_create_private_mapping(phys, __BP_HARDEN_HYP_VECS_SZ, + PAGE_HYP_EXEC, &bp_base); + if (ret) + return ret; + + __hyp_bp_vect_base = (void *)bp_base; + + return 0; +} + +static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys) +{ + kvm_pte_t pte, *ptep = slot->ptep; + + pte = *ptep; + pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID); + pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID; + WRITE_ONCE(*ptep, pte); + dsb(ishst); + + return (void *)slot->addr; +} + +void *hyp_fixmap_map(phys_addr_t phys) +{ + return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys); +} + +static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) +{ + kvm_pte_t *ptep = slot->ptep; + u64 addr = slot->addr; + u32 level; + + if (FIELD_GET(KVM_PTE_TYPE, *ptep) == KVM_PTE_TYPE_PAGE) + level = KVM_PGTABLE_LAST_LEVEL; + else + level = KVM_PGTABLE_LAST_LEVEL - 1; /* create_fixblock() guarantees PMD level */ + + WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID); + + /* + * Irritatingly, the architecture requires that we use inner-shareable + * broadcast TLB invalidation here in case another CPU speculates + * through our fixmap and decides to create an "amalagamation of the + * values held in the TLB" due to the apparent lack of a + * break-before-make sequence. + * + * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 + */ + dsb(ishst); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); + dsb(ish); + isb(); +} + +void hyp_fixmap_unmap(void) +{ + fixmap_clear_slot(this_cpu_ptr(&fixmap_slots)); +} + +static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct hyp_fixmap_slot *slot = (struct hyp_fixmap_slot *)ctx->arg; + + if (!kvm_pte_valid(ctx->old) || (ctx->end - ctx->start) != kvm_granule_size(ctx->level)) + return -EINVAL; + + slot->addr = ctx->addr; + slot->ptep = ctx->ptep; + + /* + * Clear the PTE, but keep the page-table page refcount elevated to + * prevent it from ever being freed. This lets us manipulate the PTEs + * by hand safely without ever needing to allocate memory. 
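+	 * From then on, hyp_fixmap_map() and hyp_fixmap_unmap() simply toggle
+	 * KVM_PTE_VALID (and the output address) on the cached ptep, so the
+	 * map/unmap fast path never walks or allocates page-tables.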
+ */ + fixmap_clear_slot(slot); + + return 0; +} + +static int create_fixmap_slot(u64 addr, u64 cpu) +{ + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = per_cpu_ptr(&fixmap_slots, cpu), + }; + + return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker); +} + +#if PAGE_SHIFT < 16 +#define HAS_FIXBLOCK +static struct hyp_fixmap_slot hyp_fixblock_slot; +static DEFINE_HYP_SPINLOCK(hyp_fixblock_lock); +#endif + +static int create_fixblock(void) +{ +#ifdef HAS_FIXBLOCK + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &hyp_fixblock_slot, + }; + unsigned long addr; + phys_addr_t phys; + int ret, i; + + /* Find a RAM phys address, PMD aligned */ + for (i = 0; i < hyp_memblock_nr; i++) { + phys = ALIGN(hyp_memory[i].base, PMD_SIZE); + if (phys + PMD_SIZE < (hyp_memory[i].base + hyp_memory[i].size)) + break; + } + + if (i >= hyp_memblock_nr) + return -EINVAL; + + hyp_spin_lock(&pkvm_pgd_lock); + addr = ALIGN(__io_map_base, PMD_SIZE); + ret = __pkvm_alloc_private_va_range(addr, PMD_SIZE); + if (ret) + goto unlock; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PMD_SIZE, phys, PAGE_HYP); + if (ret) + goto unlock; + + ret = kvm_pgtable_walk(&pkvm_pgtable, addr, PMD_SIZE, &walker); + +unlock: + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +#else + return 0; +#endif +} + +void *hyp_fixblock_map(phys_addr_t phys, size_t *size) +{ +#ifdef HAS_FIXBLOCK + *size = PMD_SIZE; + hyp_spin_lock(&hyp_fixblock_lock); + return fixmap_map_slot(&hyp_fixblock_slot, phys); +#else + *size = PAGE_SIZE; + return hyp_fixmap_map(phys); +#endif +} + +void hyp_fixblock_unmap(void) +{ +#ifdef HAS_FIXBLOCK + fixmap_clear_slot(&hyp_fixblock_slot); + hyp_spin_unlock(&hyp_fixblock_lock); +#else + hyp_fixmap_unmap(); +#endif +} + +int hyp_create_fixmap(void) +{ + unsigned long addr, i; + int ret; + + for (i = 0; i < hyp_nr_cpus; i++) { + ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr); + if (ret) + return ret; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE, + __hyp_pa(__hyp_bss_start), PAGE_HYP); + if (ret) + return ret; + + ret = create_fixmap_slot(addr, i); + if (ret) + return ret; + } + + return create_fixblock(); +} + +int hyp_create_idmap(u32 hyp_va_bits) +{ + unsigned long start, end; + + start = hyp_virt_to_phys((void *)__hyp_idmap_text_start); + start = ALIGN_DOWN(start, PAGE_SIZE); + + end = hyp_virt_to_phys((void *)__hyp_idmap_text_end); + end = ALIGN(end, PAGE_SIZE); + + /* + * One half of the VA space is reserved to linearly map portions of + * memory -- see va_layout.c for more details. The other half of the VA + * space contains the trampoline page, and needs some care. Split that + * second half in two and find the quarter of VA space not conflicting + * with the idmap to place the IOs and the vmemmap. IOs use the lower + * half of the quarter and the vmemmap the upper half. + */ + __io_map_base = start & BIT(hyp_va_bits - 2); + __io_map_base ^= BIT(hyp_va_bits - 2); + __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3); + + return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); +} + +int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) +{ + unsigned long addr, prev_base; + size_t size; + int ret; + + hyp_spin_lock(&pkvm_pgd_lock); + + prev_base = __io_map_base; + /* + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies + * an alignment of our allocation on the order of the size. 
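+	 * e.g. with 4KiB pages (NVHE_STACK_SIZE == 4KiB), an 8KiB window
+	 * aligned to 8KiB is reserved: the low half is left unbacked as a
+	 * guard page and only the high half is mapped as the stack.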
+ */ + size = NVHE_STACK_SIZE * 2; + addr = ALIGN(__io_map_base, size); + + ret = __pkvm_alloc_private_va_range(addr, size); + if (!ret) { + /* + * Since the stack grows downwards, map the stack to the page + * at the higher address and leave the lower guard page + * unbacked. + * + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 + * and addresses corresponding to the guard page have the + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. + */ + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + NVHE_STACK_SIZE, + NVHE_STACK_SIZE, phys, PAGE_HYP); + if (ret) + __io_map_base = prev_base; + } + hyp_spin_unlock(&pkvm_pgd_lock); + + *haddr = addr + size; + + return ret; +} + +static void *admit_host_page(void *arg) +{ + struct kvm_hyp_memcache *host_mc = arg; + + if (!host_mc->nr_pages) + return NULL; + + /* + * The host still owns the pages in its memcache, so we need to go + * through a full host-to-hyp donation cycle to change it. Fortunately, + * __pkvm_host_donate_hyp() takes care of races for us, so if it + * succeeds we're good to go. + */ + if (__pkvm_host_donate_hyp(hyp_phys_to_pfn(host_mc->head), 1)) + return NULL; + + return pop_hyp_memcache(host_mc, hyp_phys_to_virt); +} + +/* Refill our local memcache by popping pages from the one provided by the host. */ +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc) +{ + struct kvm_hyp_memcache tmp = *host_mc; + int ret; + + ret = __topup_hyp_memcache(mc, min_pages, admit_host_page, + hyp_virt_to_phys, &tmp); + *host_mc = tmp; + + return ret; +} diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c new file mode 100644 index 000000000000..a1eb27a1a747 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <asm/kvm_hyp.h> +#include <nvhe/gfp.h> + +u64 __hyp_vmemmap; + +/* + * Index the hyp_vmemmap to find a potential buddy page, but make no assumption + * about its current state. + * + * Example buddy-tree for a 4-pages physically contiguous pool: + * + * o : Page 3 + * / + * o-o : Page 2 + * / + * / o : Page 1 + * / / + * o---o-o : Page 0 + * Order 2 1 0 + * + * Example of requests on this pool: + * __find_buddy_nocheck(pool, page 0, order 0) => page 1 + * __find_buddy_nocheck(pool, page 0, order 1) => page 2 + * __find_buddy_nocheck(pool, page 1, order 0) => page 0 + * __find_buddy_nocheck(pool, page 2, order 0) => page 3 + */ +static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, + struct hyp_page *p, + u8 order) +{ + phys_addr_t addr = hyp_page_to_phys(p); + + addr ^= (PAGE_SIZE << order); + + /* + * Don't return a page outside the pool range -- it belongs to + * something else and may not be mapped in hyp_vmemmap. + */ + if (addr < pool->range_start || addr >= pool->range_end) + return NULL; + + return hyp_phys_to_page(addr); +} + +/* Find a buddy page currently available for allocation */ +static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, + struct hyp_page *p, + u8 order) +{ + struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); + + if (!buddy || buddy->order != order || buddy->refcount) + return NULL; + + return buddy; + +} + +/* + * Pages that are available for allocation are tracked in free-lists, so we use + * the pages themselves to store the list nodes to avoid wasting space. 
As the + * allocator always returns zeroed pages (which are zeroed on the hyp_put_page() + * path to optimize allocation speed), we also need to clean-up the list node in + * each page when we take it out of the list. + */ +static inline void page_remove_from_list(struct hyp_page *p) +{ + struct list_head *node = hyp_page_to_virt(p); + + __list_del_entry(node); + memset(node, 0, sizeof(*node)); +} + +static inline void page_add_to_list(struct hyp_page *p, struct list_head *head) +{ + struct list_head *node = hyp_page_to_virt(p); + + INIT_LIST_HEAD(node); + list_add_tail(node, head); +} + +static inline struct hyp_page *node_to_page(struct list_head *node) +{ + return hyp_virt_to_page(node); +} + +static void __hyp_attach_page(struct hyp_pool *pool, + struct hyp_page *p) +{ + phys_addr_t phys = hyp_page_to_phys(p); + u8 order = p->order; + struct hyp_page *buddy; + + memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); + + /* Skip coalescing for 'external' pages being freed into the pool. */ + if (phys < pool->range_start || phys >= pool->range_end) + goto insert; + + /* + * Only the first struct hyp_page of a high-order page (otherwise known + * as the 'head') should have p->order set. The non-head pages should + * have p->order = HYP_NO_ORDER. Here @p may no longer be the head + * after coalescing, so make sure to mark it HYP_NO_ORDER proactively. + */ + p->order = HYP_NO_ORDER; + for (; (order + 1) <= pool->max_order; order++) { + buddy = __find_buddy_avail(pool, p, order); + if (!buddy) + break; + + /* Take the buddy out of its list, and coalesce with @p */ + page_remove_from_list(buddy); + buddy->order = HYP_NO_ORDER; + p = min(p, buddy); + } + +insert: + /* Mark the new head, and insert it */ + p->order = order; + page_add_to_list(p, &pool->free_area[order]); +} + +static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, + struct hyp_page *p, + u8 order) +{ + struct hyp_page *buddy; + + page_remove_from_list(p); + while (p->order > order) { + /* + * The buddy of order n - 1 currently has HYP_NO_ORDER as it + * is covered by a higher-level page (whose head is @p). Use + * __find_buddy_nocheck() to find it and inject it in the + * free_list[n - 1], effectively splitting @p in half. + */ + p->order--; + buddy = __find_buddy_nocheck(pool, p, p->order); + buddy->order = p->order; + page_add_to_list(buddy, &pool->free_area[buddy->order]); + } + + return p; +} + +static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p) +{ + if (hyp_page_ref_dec_and_test(p)) + __hyp_attach_page(pool, p); +} + +/* + * Changes to the buddy tree and page refcounts must be done with the hyp_pool + * lock held. If a refcount change requires an update to the buddy tree (e.g. + * hyp_put_page()), both operations must be done within the same critical + * section to guarantee transient states (e.g. a page with null refcount but + * not yet attached to a free list) can't be observed by well-behaved readers. 
+ */ +void hyp_put_page(struct hyp_pool *pool, void *addr) +{ + struct hyp_page *p = hyp_virt_to_page(addr); + + hyp_spin_lock(&pool->lock); + __hyp_put_page(pool, p); + hyp_spin_unlock(&pool->lock); +} + +void hyp_get_page(struct hyp_pool *pool, void *addr) +{ + struct hyp_page *p = hyp_virt_to_page(addr); + + hyp_spin_lock(&pool->lock); + hyp_page_ref_inc(p); + hyp_spin_unlock(&pool->lock); +} + +void hyp_split_page(struct hyp_page *p) +{ + u8 order = p->order; + unsigned int i; + + p->order = 0; + for (i = 1; i < (1 << order); i++) { + struct hyp_page *tail = p + i; + + tail->order = 0; + hyp_set_page_refcounted(tail); + } +} + +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order) +{ + struct hyp_page *p; + u8 i = order; + + hyp_spin_lock(&pool->lock); + + /* Look for a high-enough-order page */ + while (i <= pool->max_order && list_empty(&pool->free_area[i])) + i++; + if (i > pool->max_order) { + hyp_spin_unlock(&pool->lock); + return NULL; + } + + /* Extract it from the tree at the right order */ + p = node_to_page(pool->free_area[i].next); + p = __hyp_extract_page(pool, p, order); + + hyp_set_page_refcounted(p); + hyp_spin_unlock(&pool->lock); + + return hyp_page_to_virt(p); +} + +int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, + unsigned int reserved_pages) +{ + phys_addr_t phys = hyp_pfn_to_phys(pfn); + struct hyp_page *p; + int i; + + hyp_spin_lock_init(&pool->lock); + pool->max_order = min(MAX_PAGE_ORDER, + get_order(nr_pages << PAGE_SHIFT)); + for (i = 0; i <= pool->max_order; i++) + INIT_LIST_HEAD(&pool->free_area[i]); + pool->range_start = phys; + pool->range_end = phys + (nr_pages << PAGE_SHIFT); + + /* Init the vmemmap portion */ + p = hyp_phys_to_page(phys); + for (i = 0; i < nr_pages; i++) + hyp_set_page_refcounted(&p[i]); + + /* Attach the unused pages to the buddy tree */ + for (i = reserved_pages; i < nr_pages; i++) + __hyp_put_page(pool, &p[i]); + + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c new file mode 100644 index 000000000000..8911338961c5 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -0,0 +1,894 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#include <linux/kvm_host.h> +#include <linux/mm.h> + +#include <asm/kvm_emulate.h> + +#include <nvhe/mem_protect.h> +#include <nvhe/memory.h> +#include <nvhe/pkvm.h> +#include <nvhe/trap_handler.h> + +/* Used by icache_is_aliasing(). */ +unsigned long __icache_flags; + +/* Used by kvm_get_vttbr(). */ +unsigned int kvm_arm_vmid_bits; + +unsigned int kvm_host_sve_max_vl; + +/* + * The currently loaded hyp vCPU for each physical CPU. Used in protected mode + * for both protected and non-protected VMs. 
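+ * Set by pkvm_load_hyp_vcpu() and cleared by pkvm_put_hyp_vcpu().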
+ */ +static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); + +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; + + if (has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; + + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; +} + +static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.hcr_el2; + + /* No support for AArch32. */ + val |= HCR_RW; + + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; + + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { + val |= HCR_TERR | HCR_TEA; + val &= ~(HCR_FIEN); + } + + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) + val &= ~(HCR_AMVOFFEN); + + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) { + val |= HCR_TID5; + val &= ~(HCR_DCT | HCR_ATA); + } + + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) + val |= HCR_TLOR; + + vcpu->arch.hcr_el2 = val; +} + +static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.mdcr_el2; + + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) { + val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); + } + + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP)) + val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; + + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) + val |= MDCR_EL2_TDOSA; + + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) { + val |= MDCR_EL2_TPMS; + val &= ~MDCR_EL2_E2PB_MASK; + } + + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + val |= MDCR_EL2_TTRF; + + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP)) + val |= MDCR_EL2_E2TB_MASK; + + /* Trap Debug Communications Channel registers */ + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + val |= MDCR_EL2_TDCC; + + vcpu->arch.mdcr_el2 = val; +} + +/* + * Check that cpu features that are neither trapped nor supported are not + * enabled for protected VMs. + */ +static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + /* No AArch32 support for protected guests. */ + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) + return -EINVAL; + + /* + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. + */ + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) || + !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP)) + return -EINVAL; + + /* No SME support in KVM right now. Check to catch if it changes. */ + if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) + return -EINVAL; + + return 0; +} + +/* + * Initialize trap register values in protected mode. 
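+ * For non-protected vCPUs the host-provided HCRX/FGT values are trusted
+ * as-is; protected vCPUs have their features checked first and their
+ * HCR_EL2/MDCR_EL2 traps tightened accordingly.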
+ */ +static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + int ret; + + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); + + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + /* Trust the host for non-protected vcpu features. */ + vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2; + memcpy(vcpu->arch.fgt, host_vcpu->arch.fgt, sizeof(vcpu->arch.fgt)); + return 0; + } + + ret = pkvm_check_pvm_cpu_features(vcpu); + if (ret) + return ret; + + pvm_init_traps_hcr(vcpu); + pvm_init_traps_mdcr(vcpu); + vcpu_set_hcrx(vcpu); + + return 0; +} + +/* + * Start the VM table handle at the offset defined instead of at 0. + * Mainly for sanity checking and debugging. + */ +#define HANDLE_OFFSET 0x1000 + +/* + * Marks a reserved but not yet used entry in the VM table. + */ +#define RESERVED_ENTRY ((void *)0xa110ca7ed) + +static unsigned int vm_handle_to_idx(pkvm_handle_t handle) +{ + return handle - HANDLE_OFFSET; +} + +static pkvm_handle_t idx_to_vm_handle(unsigned int idx) +{ + return idx + HANDLE_OFFSET; +} + +/* + * Spinlock for protecting state related to the VM table. Protects writes + * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization. + * Also protects reads and writes to 'last_hyp_vcpu_lookup'. + */ +DEFINE_HYP_SPINLOCK(vm_table_lock); + +/* + * A table that tracks all VMs in protected mode. + * Allocated during hyp initialization and setup. + */ +static struct pkvm_hyp_vm **vm_table; + +void pkvm_hyp_vm_table_init(void *tbl) +{ + WARN_ON(vm_table); + vm_table = tbl; +} + +/* + * Return the hyp vm structure corresponding to the handle. + */ +static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) +{ + unsigned int idx = vm_handle_to_idx(handle); + + if (unlikely(idx >= KVM_MAX_PVMS)) + return NULL; + + /* A reserved entry doesn't represent an initialized VM. */ + if (unlikely(vm_table[idx] == RESERVED_ENTRY)) + return NULL; + + return vm_table[idx]; +} + +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx) +{ + struct pkvm_hyp_vcpu *hyp_vcpu = NULL; + struct pkvm_hyp_vm *hyp_vm; + + /* Cannot load a new vcpu without putting the old one first. */ + if (__this_cpu_read(loaded_hyp_vcpu)) + return NULL; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx) + goto unlock; + + hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + if (!hyp_vcpu) + goto unlock; + + /* Ensure vcpu isn't loaded on more than one cpu simultaneously. 
*/ + if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { + hyp_vcpu = NULL; + goto unlock; + } + + hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu); + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); +unlock: + hyp_spin_unlock(&vm_table_lock); + + if (hyp_vcpu) + __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu); + return hyp_vcpu; +} + +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + + hyp_spin_lock(&vm_table_lock); + hyp_vcpu->loaded_hyp_vcpu = NULL; + __this_cpu_write(loaded_hyp_vcpu, NULL); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) +{ + return __this_cpu_read(loaded_hyp_vcpu); + +} + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (hyp_vm) + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm; +} + +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm) +{ + hyp_spin_lock(&vm_table_lock); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); + + if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) { + put_pkvm_hyp_vm(hyp_vm); + hyp_vm = NULL; + } + + return hyp_vm; +} + +static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) +{ + struct kvm *kvm = &hyp_vm->kvm; + unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); + DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* CTR_EL0 is always under host control, even for protected VMs. */ + hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; + + /* Preserve the vgic model so that GICv3 emulation works */ + hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model; + + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) + set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); + + /* No restrictions for non-protected VMs. 
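+ * The host's arch flags and vcpu feature bitmap are copied verbatim below.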
*/ + if (!kvm_vm_is_protected(kvm)) { + hyp_vm->kvm.arch.flags = host_arch_flags; + + bitmap_copy(kvm->arch.vcpu_features, + host_kvm->arch.vcpu_features, + KVM_VCPU_MAX_FEATURES); + + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags)) + hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1; + + return; + } + + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); + + set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) + set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) + set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) + set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) { + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE); + } + + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, + allowed_features, KVM_VCPU_MAX_FEATURES); +} + +static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) +{ + if (host_vcpu) + hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); +} + +static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + void *sve_state; + + if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE)) + return; + + sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + hyp_unpin_shared_mem(sve_state, + sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); +} + +static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], + unsigned int nr_vcpus) +{ + int i; + + for (i = 0; i < nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i]; + + if (!hyp_vcpu) + continue; + + unpin_host_vcpu(hyp_vcpu->host_vcpu); + unpin_host_sve_state(hyp_vcpu); + } +} + +static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, + unsigned int nr_vcpus, pkvm_handle_t handle) +{ + struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; + int idx = vm_handle_to_idx(handle); + + hyp_vm->kvm.arch.pkvm.handle = handle; + + hyp_vm->host_kvm = host_kvm; + hyp_vm->kvm.created_vcpus = nr_vcpus; + hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected); + hyp_vm->kvm.arch.pkvm.is_created = true; + hyp_vm->kvm.arch.flags = 0; + pkvm_init_features_from_host(hyp_vm, host_kvm); + + /* VMID 0 is reserved for the host */ + atomic64_set(&mmu->vmid.id, idx + 1); + + mmu->vtcr = host_mmu.arch.mmu.vtcr; + mmu->arch = &hyp_vm->kvm.arch; + mmu->pgt = &hyp_vm->pgt; +} + +static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + unsigned int sve_max_vl; + size_t sve_state_size; + void *sve_state; + int ret = 0; + + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { + vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); + return 0; + } + + /* Limit guest vector length to the maximum supported by the host. 
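+ * The host-donated sve_state buffer is pinned so the host cannot pull it
+ * from under the hypervisor while the vCPU is in use.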
*/ + sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl); + sve_state_size = sve_state_size_from_vl(sve_max_vl); + sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state)); + + if (!sve_state || !sve_state_size) { + ret = -EINVAL; + goto err; + } + + ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size); + if (ret) + goto err; + + vcpu->arch.sve_state = sve_state; + vcpu->arch.sve_max_vl = sve_max_vl; + + return 0; +err: + clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features); + return ret; +} + +static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, + struct pkvm_hyp_vm *hyp_vm, + struct kvm_vcpu *host_vcpu) +{ + int ret = 0; + + if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) + return -EBUSY; + + hyp_vcpu->host_vcpu = host_vcpu; + + hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; + hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); + hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx); + + hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; + hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + + ret = pkvm_vcpu_init_traps(hyp_vcpu); + if (ret) + goto done; + + ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); +done: + if (ret) + unpin_host_vcpu(host_vcpu); + return ret; +} + +static int find_free_vm_table_entry(void) +{ + int i; + + for (i = 0; i < KVM_MAX_PVMS; ++i) { + if (!vm_table[i]) + return i; + } + + return -ENOMEM; +} + +/* + * Reserve a VM table entry. + * + * Return a unique handle to the VM on success, + * negative error code on failure. + */ +static int allocate_vm_table_entry(void) +{ + int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. + */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = find_free_vm_table_entry(); + if (unlikely(idx < 0)) + return idx; + + vm_table[idx] = RESERVED_ENTRY; + + return idx; +} + +static int __insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + unsigned int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. + */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = vm_handle_to_idx(handle); + if (unlikely(idx >= KVM_MAX_PVMS)) + return -EINVAL; + + if (unlikely(vm_table[idx] != RESERVED_ENTRY)) + return -EINVAL; + + vm_table[idx] = hyp_vm; + + return 0; +} + +/* + * Insert a pointer to the initialized VM into the VM table. + * + * Return 0 on success, or negative error code on failure. + */ +static int insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + int ret; + + hyp_spin_lock(&vm_table_lock); + ret = __insert_vm_table_entry(handle, hyp_vm); + hyp_spin_unlock(&vm_table_lock); + + return ret; +} + +/* + * Deallocate and remove the VM table entry corresponding to the handle. 
+ */
+static void remove_vm_table_entry(pkvm_handle_t handle)
+{
+ hyp_assert_lock_held(&vm_table_lock);
+ vm_table[vm_handle_to_idx(handle)] = NULL;
+}
+
+static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus)
+{
+ return size_add(sizeof(struct pkvm_hyp_vm),
+ size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus));
+}
+
+static void *map_donated_memory_noclear(unsigned long host_va, size_t size)
+{
+ void *va = (void *)kern_hyp_va(host_va);
+
+ if (!PAGE_ALIGNED(va))
+ return NULL;
+
+ if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va),
+ PAGE_ALIGN(size) >> PAGE_SHIFT))
+ return NULL;
+
+ return va;
+}
+
+static void *map_donated_memory(unsigned long host_va, size_t size)
+{
+ void *va = map_donated_memory_noclear(host_va, size);
+
+ if (va)
+ memset(va, 0, size);
+
+ return va;
+}
+
+static void __unmap_donated_memory(void *va, size_t size)
+{
+ kvm_flush_dcache_to_poc(va, size);
+ WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
+ PAGE_ALIGN(size) >> PAGE_SHIFT));
+}
+
+static void unmap_donated_memory(void *va, size_t size)
+{
+ if (!va)
+ return;
+
+ memset(va, 0, size);
+ __unmap_donated_memory(va, size);
+}
+
+static void unmap_donated_memory_noclear(void *va, size_t size)
+{
+ if (!va)
+ return;
+
+ __unmap_donated_memory(va, size);
+}
+
+/*
+ * Reserves an entry in the hypervisor for a new VM in protected mode.
+ *
+ * Return a unique handle to the VM on success, negative error code on failure.
+ */
+int __pkvm_reserve_vm(void)
+{
+ int ret;
+
+ hyp_spin_lock(&vm_table_lock);
+ ret = allocate_vm_table_entry();
+ hyp_spin_unlock(&vm_table_lock);
+
+ if (ret < 0)
+ return ret;
+
+ return idx_to_vm_handle(ret);
+}
+
+/*
+ * Removes a reserved entry, but only if it hasn't been used yet.
+ * Otherwise, the VM needs to be destroyed.
+ */
+void __pkvm_unreserve_vm(pkvm_handle_t handle)
+{
+ unsigned int idx = vm_handle_to_idx(handle);
+
+ if (unlikely(!vm_table))
+ return;
+
+ hyp_spin_lock(&vm_table_lock);
+ if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY))
+ remove_vm_table_entry(handle);
+ hyp_spin_unlock(&vm_table_lock);
+}
+
+/*
+ * Initialize the hypervisor copy of the VM state using host-donated memory.
+ *
+ * Unmap the donated memory from the host at stage 2.
+ *
+ * host_kvm: A pointer to the host's struct kvm.
+ * vm_hva: The host va of the area being donated for the VM state.
+ * Must be page aligned.
+ * pgd_hva: The host va of the area being donated for the stage-2 PGD for
+ * the VM. Must be page aligned. Its size is implied by the VM's
+ * VTCR.
+ *
+ * Return 0 on success, negative error code on failure.
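+ *
+ * The handle must have been reserved beforehand via __pkvm_reserve_vm().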
+ */ +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva) +{ + struct pkvm_hyp_vm *hyp_vm = NULL; + size_t vm_size, pgd_size; + unsigned int nr_vcpus; + pkvm_handle_t handle; + void *pgd = NULL; + int ret; + + ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1); + if (ret) + return ret; + + nr_vcpus = READ_ONCE(host_kvm->created_vcpus); + if (nr_vcpus < 1) { + ret = -EINVAL; + goto err_unpin_kvm; + } + + handle = READ_ONCE(host_kvm->arch.pkvm.handle); + if (unlikely(handle < HANDLE_OFFSET)) { + ret = -EINVAL; + goto err_unpin_kvm; + } + + vm_size = pkvm_get_hyp_vm_size(nr_vcpus); + pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr); + + ret = -ENOMEM; + + hyp_vm = map_donated_memory(vm_hva, vm_size); + if (!hyp_vm) + goto err_remove_mappings; + + pgd = map_donated_memory_noclear(pgd_hva, pgd_size); + if (!pgd) + goto err_remove_mappings; + + init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle); + + ret = kvm_guest_prepare_stage2(hyp_vm, pgd); + if (ret) + goto err_remove_mappings; + + /* Must be called last since this publishes the VM. */ + ret = insert_vm_table_entry(handle, hyp_vm); + if (ret) + goto err_remove_mappings; + + return 0; + +err_remove_mappings: + unmap_donated_memory(hyp_vm, vm_size); + unmap_donated_memory(pgd, pgd_size); +err_unpin_kvm: + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); + return ret; +} + +/* + * Initialize the hypervisor copy of the vCPU state using host-donated memory. + * + * handle: The hypervisor handle for the vm. + * host_vcpu: A pointer to the corresponding host vcpu. + * vcpu_hva: The host va of the area being donated for the vcpu state. + * Must be page aligned. The size of the area must be equal to + * the page-aligned size of 'struct pkvm_hyp_vcpu'. + * Return 0 on success, negative error code on failure. 
+ */ +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + struct pkvm_hyp_vm *hyp_vm; + unsigned int idx; + int ret; + + hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu)); + if (!hyp_vcpu) + return -ENOMEM; + + hyp_spin_lock(&vm_table_lock); + + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + ret = -ENOENT; + goto unlock; + } + + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu); + if (ret) + goto unlock; + + idx = hyp_vcpu->vcpu.vcpu_idx; + if (idx >= hyp_vm->kvm.created_vcpus) { + ret = -EINVAL; + goto unlock; + } + + if (hyp_vm->vcpus[idx]) { + ret = -EINVAL; + goto unlock; + } + + hyp_vm->vcpus[idx] = hyp_vcpu; +unlock: + hyp_spin_unlock(&vm_table_lock); + + if (ret) + unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + return ret; +} + +static void +teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) +{ + size = PAGE_ALIGN(size); + memset(addr, 0, size); + + for (void *start = addr; start < addr + size; start += PAGE_SIZE) + push_hyp_memcache(mc, start, hyp_virt_to_phys); + + unmap_donated_memory_noclear(addr, size); +} + +int __pkvm_teardown_vm(pkvm_handle_t handle) +{ + struct kvm_hyp_memcache *mc, *stage2_mc; + struct pkvm_hyp_vm *hyp_vm; + struct kvm *host_kvm; + unsigned int idx; + size_t vm_size; + int err; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + err = -ENOENT; + goto err_unlock; + } + + if (WARN_ON(hyp_page_count(hyp_vm))) { + err = -EBUSY; + goto err_unlock; + } + + host_kvm = hyp_vm->host_kvm; + + /* Ensure the VMID is clean before it can be reallocated */ + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + remove_vm_table_entry(handle); + hyp_spin_unlock(&vm_table_lock); + + /* Reclaim guest pages (including page-table pages) */ + mc = &host_kvm->arch.pkvm.teardown_mc; + stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc; + reclaim_pgtable_pages(hyp_vm, stage2_mc); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus); + + /* Push the metadata pages to the teardown memcache */ + for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + struct kvm_hyp_memcache *vcpu_mc; + + if (!hyp_vcpu) + continue; + + vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + + while (vcpu_mc->nr_pages) { + void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); + + push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys); + unmap_donated_memory_noclear(addr, PAGE_SIZE); + } + + teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); + } + + vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); + teardown_donated_memory(mc, hyp_vm, vm_size); + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); + return 0; + +err_unlock: + hyp_spin_unlock(&vm_table_lock); + return err; +} diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c new file mode 100644 index 000000000000..c3e196fb8b18 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google LLC + * Author: David Brazdil <dbrazdil@google.com> + */ + +#include <asm/kvm_asm.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> +#include <uapi/linux/psci.h> + +#include <nvhe/memory.h> +#include <nvhe/trap_handler.h> + +void kvm_hyp_cpu_entry(unsigned long r0); +void kvm_hyp_cpu_resume(unsigned long r0); + +void __noreturn 
__host_enter(struct kvm_cpu_context *host_ctxt); + +/* Config options set by the host. */ +struct kvm_host_psci_config __ro_after_init kvm_host_psci_config; + +#define INVALID_CPU_ID UINT_MAX + +struct psci_boot_args { + atomic_t lock; + unsigned long pc; + unsigned long r0; +}; + +#define PSCI_BOOT_ARGS_UNLOCKED 0 +#define PSCI_BOOT_ARGS_LOCKED 1 + +#define PSCI_BOOT_ARGS_INIT \ + ((struct psci_boot_args){ \ + .lock = ATOMIC_INIT(PSCI_BOOT_ARGS_UNLOCKED), \ + }) + +static DEFINE_PER_CPU(struct psci_boot_args, cpu_on_args) = PSCI_BOOT_ARGS_INIT; +static DEFINE_PER_CPU(struct psci_boot_args, suspend_args) = PSCI_BOOT_ARGS_INIT; + +#define is_psci_0_1(what, func_id) \ + (kvm_host_psci_config.psci_0_1_ ## what ## _implemented && \ + (func_id) == kvm_host_psci_config.function_ids_0_1.what) + +static bool is_psci_0_1_call(u64 func_id) +{ + return (is_psci_0_1(cpu_suspend, func_id) || + is_psci_0_1(cpu_on, func_id) || + is_psci_0_1(cpu_off, func_id) || + is_psci_0_1(migrate, func_id)); +} + +static bool is_psci_0_2_call(u64 func_id) +{ + /* SMCCC reserves IDs 0x00-1F with the given 32/64-bit base for PSCI. */ + return (PSCI_0_2_FN(0) <= func_id && func_id <= PSCI_0_2_FN(31)) || + (PSCI_0_2_FN64(0) <= func_id && func_id <= PSCI_0_2_FN64(31)); +} + +static unsigned long psci_call(unsigned long fn, unsigned long arg0, + unsigned long arg1, unsigned long arg2) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(fn, arg0, arg1, arg2, &res); + return res.a0; +} + +static unsigned long psci_forward(struct kvm_cpu_context *host_ctxt) +{ + return psci_call(cpu_reg(host_ctxt, 0), cpu_reg(host_ctxt, 1), + cpu_reg(host_ctxt, 2), cpu_reg(host_ctxt, 3)); +} + +static unsigned int find_cpu_id(u64 mpidr) +{ + unsigned int i; + + /* Reject invalid MPIDRs */ + if (mpidr & ~MPIDR_HWID_BITMASK) + return INVALID_CPU_ID; + + for (i = 0; i < NR_CPUS; i++) { + if (cpu_logical_map(i) == mpidr) + return i; + } + + return INVALID_CPU_ID; +} + +static __always_inline bool try_acquire_boot_args(struct psci_boot_args *args) +{ + return atomic_cmpxchg_acquire(&args->lock, + PSCI_BOOT_ARGS_UNLOCKED, + PSCI_BOOT_ARGS_LOCKED) == + PSCI_BOOT_ARGS_UNLOCKED; +} + +static __always_inline void release_boot_args(struct psci_boot_args *args) +{ + atomic_set_release(&args->lock, PSCI_BOOT_ARGS_UNLOCKED); +} + +static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, mpidr, host_ctxt, 1); + DECLARE_REG(unsigned long, pc, host_ctxt, 2); + DECLARE_REG(unsigned long, r0, host_ctxt, 3); + + unsigned int cpu_id; + struct psci_boot_args *boot_args; + struct kvm_nvhe_init_params *init_params; + int ret; + + /* + * Find the logical CPU ID for the given MPIDR. The search set is + * the set of CPUs that were online at the point of KVM initialization. + * Booting other CPUs is rejected because their cpufeatures were not + * checked against the finalized capabilities. This could be relaxed + * by doing the feature checks in hyp. + */ + cpu_id = find_cpu_id(mpidr); + if (cpu_id == INVALID_CPU_ID) + return PSCI_RET_INVALID_PARAMS; + + boot_args = per_cpu_ptr(&cpu_on_args, cpu_id); + init_params = per_cpu_ptr(&kvm_init_params, cpu_id); + + /* Check if the target CPU is already being booted. */ + if (!try_acquire_boot_args(boot_args)) + return PSCI_RET_ALREADY_ON; + + boot_args->pc = pc; + boot_args->r0 = r0; + wmb(); + + ret = psci_call(func_id, mpidr, + __hyp_pa(&kvm_hyp_cpu_entry), + __hyp_pa(init_params)); + + /* If successful, the lock will be released by the target CPU. 
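+ * The target CPU releases it from __kvm_host_psci_cpu_entry() once the
+ * boot args have been consumed.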
*/ + if (ret != PSCI_RET_SUCCESS) + release_boot_args(boot_args); + + return ret; +} + +static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, power_state, host_ctxt, 1); + DECLARE_REG(unsigned long, pc, host_ctxt, 2); + DECLARE_REG(unsigned long, r0, host_ctxt, 3); + + struct psci_boot_args *boot_args; + struct kvm_nvhe_init_params *init_params; + + boot_args = this_cpu_ptr(&suspend_args); + init_params = this_cpu_ptr(&kvm_init_params); + + /* + * No need to acquire a lock before writing to boot_args because a core + * can only suspend itself. Racy CPU_ON calls use a separate struct. + */ + boot_args->pc = pc; + boot_args->r0 = r0; + + /* + * Will either return if shallow sleep state, or wake up into the entry + * point if it is a deep sleep state. + */ + return psci_call(func_id, power_state, + __hyp_pa(&kvm_hyp_cpu_resume), + __hyp_pa(init_params)); +} + +static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, pc, host_ctxt, 1); + DECLARE_REG(unsigned long, r0, host_ctxt, 2); + + struct psci_boot_args *boot_args; + struct kvm_nvhe_init_params *init_params; + + boot_args = this_cpu_ptr(&suspend_args); + init_params = this_cpu_ptr(&kvm_init_params); + + /* + * No need to acquire a lock before writing to boot_args because a core + * can only suspend itself. Racy CPU_ON calls use a separate struct. + */ + boot_args->pc = pc; + boot_args->r0 = r0; + + /* Will only return on error. */ + return psci_call(func_id, + __hyp_pa(&kvm_hyp_cpu_resume), + __hyp_pa(init_params), 0); +} + +asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) +{ + struct psci_boot_args *boot_args; + struct kvm_cpu_context *host_ctxt; + + host_ctxt = host_data_ptr(host_ctxt); + + if (is_cpu_on) + boot_args = this_cpu_ptr(&cpu_on_args); + else + boot_args = this_cpu_ptr(&suspend_args); + + cpu_reg(host_ctxt, 0) = boot_args->r0; + write_sysreg_el2(boot_args->pc, SYS_ELR); + + if (is_cpu_on) + release_boot_args(boot_args); + + write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR); + write_sysreg(INIT_PSTATE_EL1, SPSR_EL2); + + __host_enter(host_ctxt); +} + +static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + if (is_psci_0_1(cpu_off, func_id) || is_psci_0_1(migrate, func_id)) + return psci_forward(host_ctxt); + if (is_psci_0_1(cpu_on, func_id)) + return psci_cpu_on(func_id, host_ctxt); + if (is_psci_0_1(cpu_suspend, func_id)) + return psci_cpu_suspend(func_id, host_ctxt); + + return PSCI_RET_NOT_SUPPORTED; +} + +static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + switch (func_id) { + case PSCI_0_2_FN_PSCI_VERSION: + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN64_AFFINITY_INFO: + case PSCI_0_2_FN64_MIGRATE: + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: + case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU: + return psci_forward(host_ctxt); + /* + * SYSTEM_OFF/RESET should not return according to the spec. + * Allow it so as to stay robust to broken firmware. 
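+ * If the call does return, the error code is simply forwarded to the host.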
+ */ + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + return psci_forward(host_ctxt); + case PSCI_0_2_FN64_CPU_SUSPEND: + return psci_cpu_suspend(func_id, host_ctxt); + case PSCI_0_2_FN64_CPU_ON: + return psci_cpu_on(func_id, host_ctxt); + default: + return PSCI_RET_NOT_SUPPORTED; + } +} + +static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + switch (func_id) { + case PSCI_1_0_FN_PSCI_FEATURES: + case PSCI_1_0_FN_SET_SUSPEND_MODE: + case PSCI_1_1_FN64_SYSTEM_RESET2: + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: + return psci_forward(host_ctxt); + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + return psci_system_suspend(func_id, host_ctxt); + default: + return psci_0_2_handler(func_id, host_ctxt); + } +} + +bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) +{ + unsigned long ret; + + switch (kvm_host_psci_config.version) { + case PSCI_VERSION(0, 1): + if (!is_psci_0_1_call(func_id)) + return false; + ret = psci_0_1_handler(func_id, host_ctxt); + break; + case PSCI_VERSION(0, 2): + if (!is_psci_0_2_call(func_id)) + return false; + ret = psci_0_2_handler(func_id, host_ctxt); + break; + default: + if (!is_psci_0_2_call(func_id)) + return false; + ret = psci_1_0_handler(func_id, host_ctxt); + break; + } + + cpu_reg(host_ctxt, 0) = ret; + cpu_reg(host_ctxt, 1) = 0; + cpu_reg(host_ctxt, 2) = 0; + cpu_reg(host_ctxt, 3) = 0; + return true; +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c new file mode 100644 index 000000000000..90bd014e952f --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> + +#include <nvhe/early_alloc.h> +#include <nvhe/ffa.h> +#include <nvhe/gfp.h> +#include <nvhe/memory.h> +#include <nvhe/mem_protect.h> +#include <nvhe/mm.h> +#include <nvhe/pkvm.h> +#include <nvhe/trap_handler.h> + +unsigned long hyp_nr_cpus; + +#define hyp_percpu_size ((unsigned long)__per_cpu_end - \ + (unsigned long)__per_cpu_start) + +static void *vmemmap_base; +static void *vm_table_base; +static void *hyp_pgt_base; +static void *host_s2_pgt_base; +static void *selftest_base; +static void *ffa_proxy_pages; +static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; +static struct hyp_pool hpool; + +static int divide_memory_pool(void *virt, unsigned long size) +{ + unsigned long nr_pages; + + hyp_early_alloc_init(virt, size); + + nr_pages = pkvm_selftest_pages(); + selftest_base = hyp_early_alloc_contig(nr_pages); + if (nr_pages && !selftest_base) + return -ENOMEM; + + nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page)); + vmemmap_base = hyp_early_alloc_contig(nr_pages); + if (!vmemmap_base) + return -ENOMEM; + + nr_pages = hyp_vm_table_pages(); + vm_table_base = hyp_early_alloc_contig(nr_pages); + if (!vm_table_base) + return -ENOMEM; + + nr_pages = hyp_s1_pgtable_pages(); + hyp_pgt_base = hyp_early_alloc_contig(nr_pages); + if (!hyp_pgt_base) + return -ENOMEM; + + nr_pages = host_s2_pgtable_pages(); + host_s2_pgt_base = hyp_early_alloc_contig(nr_pages); + if (!host_s2_pgt_base) + return -ENOMEM; + + nr_pages = hyp_ffa_proxy_pages(); + ffa_proxy_pages = hyp_early_alloc_contig(nr_pages); + if (!ffa_proxy_pages) + return -ENOMEM; + + return 0; +} + +static int pkvm_create_host_sve_mappings(void) +{ + 
void *start, *end;
+ int ret, i;
+
+ if (!system_supports_sve())
+ return 0;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i);
+ struct cpu_sve_state *sve_state = host_data->sve_state;
+
+ start = kern_hyp_va(sve_state);
+ end = start + PAGE_ALIGN(pkvm_host_sve_state_size());
+ ret = pkvm_create_mappings(start, end, PAGE_HYP);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
+ unsigned long *per_cpu_base,
+ u32 hyp_va_bits)
+{
+ void *start, *end, *virt = hyp_phys_to_virt(phys);
+ unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
+ int ret, i;
+
+ /* Recreate the hyp page-table using the early page allocator */
+ hyp_early_alloc_init(hyp_pgt_base, pgt_size);
+ ret = kvm_pgtable_hyp_init(&pkvm_pgtable, hyp_va_bits,
+ &hyp_early_alloc_mm_ops);
+ if (ret)
+ return ret;
+
+ ret = hyp_create_idmap(hyp_va_bits);
+ if (ret)
+ return ret;
+
+ ret = hyp_map_vectors();
+ if (ret)
+ return ret;
+
+ ret = hyp_back_vmemmap(hyp_virt_to_phys(vmemmap_base));
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_text_start, __hyp_text_end, PAGE_HYP_EXEC);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_bss_start, __hyp_bss_end, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ struct kvm_nvhe_init_params *params = per_cpu_ptr(&kvm_init_params, i);
+
+ start = (void *)kern_hyp_va(per_cpu_base[i]);
+ end = start + PAGE_ALIGN(hyp_percpu_size);
+ ret = pkvm_create_mappings(start, end, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_stack(params->stack_pa, &params->stack_hyp_va);
+ if (ret)
+ return ret;
+ }
+
+ return pkvm_create_host_sve_mappings();
+}
+
+static void update_nvhe_init_params(void)
+{
+ struct kvm_nvhe_init_params *params;
+ unsigned long i;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ params = per_cpu_ptr(&kvm_init_params, i);
+ params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd);
+ dcache_clean_inval_poc((unsigned long)params,
+ (unsigned long)params + sizeof(*params));
+ }
+}
+
+static void *hyp_zalloc_hyp_page(void *arg)
+{
+ return hyp_alloc_pages(&hpool, 0);
+}
+
+static void hpool_get_page(void *addr)
+{
+ hyp_get_page(&hpool, addr);
+}
+
+static void hpool_put_page(void *addr)
+{
+ hyp_put_page(&hpool, addr);
+}
+
+static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ enum pkvm_page_state state;
+ struct hyp_page *page;
+ phys_addr_t phys;
+ enum kvm_pgtable_prot prot;
+
+ if (!kvm_pte_valid(ctx->old))
+ return 0;
+
+ if (ctx->level != KVM_PGTABLE_LAST_LEVEL)
+ return -EINVAL;
+
+ phys = kvm_pte_to_phys(ctx->old);
+ if (!addr_is_memory(phys))
+ return -EINVAL;
+
+ page = hyp_phys_to_page(phys);
+
+ /*
+ * Adjust the host stage-2 mappings to match the ownership attributes
+ * configured in the hypervisor stage-1, and make sure to propagate them
+ * to the hyp_vmemmap state.
+ */
+ prot = kvm_pgtable_hyp_pte_prot(ctx->old);
+ state = pkvm_getstate(prot);
+ switch (state) {
+ case PKVM_PAGE_OWNED:
+ set_hyp_state(page, PKVM_PAGE_OWNED);
+ /* hyp text is RO in the host stage-2 to be inspected on panic.
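+ * All other hypervisor-owned pages are made inaccessible to the host.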
*/ + if (prot == PAGE_HYP_EXEC) { + set_host_state(page, PKVM_NOPAGE); + return host_stage2_idmap_locked(phys, PAGE_SIZE, KVM_PGTABLE_PROT_R); + } else { + return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); + } + case PKVM_PAGE_SHARED_OWNED: + set_hyp_state(page, PKVM_PAGE_SHARED_OWNED); + set_host_state(page, PKVM_PAGE_SHARED_BORROWED); + break; + case PKVM_PAGE_SHARED_BORROWED: + set_hyp_state(page, PKVM_PAGE_SHARED_BORROWED); + set_host_state(page, PKVM_PAGE_SHARED_OWNED); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + /* + * Fix-up the refcount for the page-table pages as the early allocator + * was unable to access the hyp_vmemmap and so the buddy allocator has + * initialised the refcount to '1'. + */ + if (kvm_pte_valid(ctx->old)) + ctx->mm_ops->get_page(ctx->ptep); + + return 0; +} + +static int fix_host_ownership(void) +{ + struct kvm_pgtable_walker walker = { + .cb = fix_host_ownership_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + int i, ret; + + for (i = 0; i < hyp_memblock_nr; i++) { + struct memblock_region *reg = &hyp_memory[i]; + u64 start = (u64)hyp_phys_to_virt(reg->base); + + ret = kvm_pgtable_walk(&pkvm_pgtable, start, reg->size, &walker); + if (ret) + return ret; + } + + return 0; +} + +static int fix_hyp_pgtable_refcnt(void) +{ + struct kvm_pgtable_walker walker = { + .cb = fix_hyp_pgtable_refcnt_walker, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + .arg = pkvm_pgtable.mm_ops, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), + &walker); +} + +void __noreturn __pkvm_init_finalise(void) +{ + struct kvm_cpu_context *host_ctxt = host_data_ptr(host_ctxt); + unsigned long nr_pages, reserved_pages, pfn; + int ret; + + /* Now that the vmemmap is backed, install the full-fledged allocator */ + pfn = hyp_virt_to_pfn(hyp_pgt_base); + nr_pages = hyp_s1_pgtable_pages(); + reserved_pages = hyp_early_alloc_nr_used_pages(); + ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages); + if (ret) + goto out; + + ret = kvm_host_prepare_stage2(host_s2_pgt_base); + if (ret) + goto out; + + pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { + .zalloc_page = hyp_zalloc_hyp_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, + .get_page = hpool_get_page, + .put_page = hpool_put_page, + .page_count = hyp_page_count, + }; + pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; + + ret = fix_host_ownership(); + if (ret) + goto out; + + ret = fix_hyp_pgtable_refcnt(); + if (ret) + goto out; + + ret = hyp_create_fixmap(); + if (ret) + goto out; + + ret = hyp_ffa_init(ffa_proxy_pages); + if (ret) + goto out; + + pkvm_hyp_vm_table_init(vm_table_base); + + pkvm_ownership_selftest(selftest_base); +out: + /* + * We tail-called to here from handle___pkvm_init() and will not return, + * so make sure to propagate the return value to the host. 
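+ * The value is passed back in the host context's x1 (cpu_reg 1).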
+ */ + cpu_reg(host_ctxt, 1) = ret; + + __host_enter(host_ctxt); +} + +int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, + unsigned long *per_cpu_base, u32 hyp_va_bits) +{ + struct kvm_nvhe_init_params *params; + void *virt = hyp_phys_to_virt(phys); + typeof(__pkvm_init_switch_pgd) *fn; + int ret; + + BUG_ON(kvm_check_pvm_sysreg_table()); + + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return -EINVAL; + + hyp_spin_lock_init(&pkvm_pgd_lock); + hyp_nr_cpus = nr_cpus; + + ret = divide_memory_pool(virt, size); + if (ret) + return ret; + + ret = recreate_hyp_mappings(phys, size, per_cpu_base, hyp_va_bits); + if (ret) + return ret; + + update_nvhe_init_params(); + + /* Jump in the idmap page to switch to the new page-tables */ + params = this_cpu_ptr(&kvm_init_params); + fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd); + fn(params->pgd_pa, params->stack_hyp_va, __pkvm_init_finalise); + + unreachable(); +} diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c new file mode 100644 index 000000000000..5b6eeab1a774 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM nVHE hypervisor stack tracing support. + * + * Copyright (C) 2022 Google LLC + */ +#include <asm/kvm_asm.h> +#include <asm/kvm_hyp.h> +#include <asm/memory.h> +#include <asm/percpu.h> + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); + +DEFINE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); + +/* + * hyp_prepare_backtrace - Prepare non-protected nVHE backtrace. + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Save the information needed by the host to unwind the non-protected + * nVHE hypervisor stack in EL1. 
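+ * This amounts to publishing the stack bases and the starting fp/pc in the
+ * per-CPU kvm_stacktrace_info for the host unwinder to consume.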
+ */ +static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - NVHE_STACK_SIZE); + stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); + stacktrace_info->fp = fp; + stacktrace_info->pc = pc; +} + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +#include <asm/stacktrace/nvhe.h> + +DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); + +static struct stack_info stackinfo_get_overflow(void) +{ + unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} + +static struct stack_info stackinfo_get_hyp(void) +{ + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + unsigned long high = params->stack_hyp_va; + unsigned long low = high - NVHE_STACK_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} + +static int unwind_next(struct unwind_state *state) +{ + return unwind_next_frame_record(state); +} + +static void notrace unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, + void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + +/* + * pkvm_save_backtrace_entry - Saves a protected nVHE HYP stacktrace entry + * + * @arg : index of the entry in the stacktrace buffer + * @where : the program counter corresponding to the stack frame + * + * Save the return address of a stack frame to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static bool pkvm_save_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long *stacktrace = this_cpu_ptr(pkvm_stacktrace); + int *idx = (int *)arg; + + /* + * Need 2 free slots: 1 for current entry and 1 for the + * delimiter. + */ + if (*idx > ARRAY_SIZE(pkvm_stacktrace) - 2) + return false; + + stacktrace[*idx] = where; + stacktrace[++*idx] = 0UL; + + return true; +} + +/* + * pkvm_save_backtrace - Saves the protected nVHE HYP stacktrace + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Save the unwinded stack addresses to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ + struct stack_info stacks[] = { + stackinfo_get_overflow(), + stackinfo_get_hyp(), + }; + struct unwind_state state = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }; + int idx = 0; + + kvm_nvhe_unwind_init(&state, fp, pc); + + unwind(&state, pkvm_save_backtrace_entry, &idx); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +/* + * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Saves the information needed by the host to dump the nVHE hypervisor + * backtrace. 
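+ * For protected KVM, the stacktrace is unwound at EL2 into a shared buffer;
+ * otherwise only the unwind starting point is shared and the host unwinds.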
+ */ +void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + if (is_protected_kvm_enabled()) + pkvm_save_backtrace(fp, pc); + else + hyp_prepare_backtrace(fp, pc); +} diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c new file mode 100644 index 000000000000..d3b9ec8a7c28 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -0,0 +1,390 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <hyp/switch.h> +#include <hyp/sysreg-sr.h> + +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> +#include <linux/types.h> +#include <linux/jump_label.h> +#include <uapi/linux/psci.h> + +#include <kvm/arm_psci.h> + +#include <asm/barrier.h> +#include <asm/cpufeature.h> +#include <asm/kprobes.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/fpsimd.h> +#include <asm/debug-monitors.h> +#include <asm/processor.h> + +#include <nvhe/mem_protect.h> + +/* Non-VHE specific context */ +DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); +DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); +DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); + +struct fgt_masks hfgrtr_masks; +struct fgt_masks hfgwtr_masks; +struct fgt_masks hfgitr_masks; +struct fgt_masks hdfgrtr_masks; +struct fgt_masks hdfgwtr_masks; +struct fgt_masks hafgrtr_masks; +struct fgt_masks hfgrtr2_masks; +struct fgt_masks hfgwtr2_masks; +struct fgt_masks hfgitr2_masks; +struct fgt_masks hdfgrtr2_masks; +struct fgt_masks hdfgwtr2_masks; + +extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); + +static void __activate_traps(struct kvm_vcpu *vcpu) +{ + ___activate_traps(vcpu, vcpu->arch.hcr_el2); + + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); + + __activate_traps_common(vcpu); + __activate_cptr_traps(vcpu); + + write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); + + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { + struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt; + + isb(); + /* + * At this stage, and thanks to the above isb(), S2 is + * configured and enabled. We can now restore the guest's S1 + * configuration: SCTLR, and only then TCR. + */ + write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL1), SYS_SCTLR); + isb(); + write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL1), SYS_TCR); + } +} + +static void __deactivate_traps(struct kvm_vcpu *vcpu) +{ + extern char __kvm_hyp_host_vector[]; + + ___deactivate_traps(vcpu); + + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { + u64 val; + + /* + * Set the TCR and SCTLR registers in the exact opposite + * sequence as __activate_traps (first prevent walks, + * then force the MMU on). A generous sprinkling of isb() + * ensure that things happen in this exact order. 
+ */ + val = read_sysreg_el1(SYS_TCR); + write_sysreg_el1(val | TCR_EPD1_MASK | TCR_EPD0_MASK, SYS_TCR); + isb(); + val = read_sysreg_el1(SYS_SCTLR); + write_sysreg_el1(val | SCTLR_ELx_M, SYS_SCTLR); + isb(); + } + + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + + __deactivate_traps_common(vcpu); + + write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2); + + __deactivate_cptr_traps(vcpu); + write_sysreg(__kvm_hyp_host_vector, vbar_el2); +} + +/* Save VGICv3 state on non-VHE systems */ +static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu) +{ + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { + __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); + } +} + +/* Restore VGICv3 state on non-VHE systems */ +static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) +{ + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); + __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); + } +} + +/* + * Disable host events, enable guest events + */ +#ifdef CONFIG_HW_PERF_EVENTS +static bool __pmu_switch_to_guest(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events; + + if (pmu->events_host) + write_sysreg(pmu->events_host, pmcntenclr_el0); + + if (pmu->events_guest) + write_sysreg(pmu->events_guest, pmcntenset_el0); + + return (pmu->events_host || pmu->events_guest); +} + +/* + * Disable guest events, enable host events + */ +static void __pmu_switch_to_host(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events; + + if (pmu->events_guest) + write_sysreg(pmu->events_guest, pmcntenclr_el0); + + if (pmu->events_host) + write_sysreg(pmu->events_host, pmcntenset_el0); +} +#else +#define __pmu_switch_to_guest(v) ({ false; }) +#define __pmu_switch_to_host(v) do {} while (0) +#endif + +/* + * Handler for protected VM MSR, MRS or System instruction execution in AArch64. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. + */ +static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* + * Make sure we handle the exit for workarounds before the pKVM + * handling, as the latter could decide to UNDEF. + */ + return (kvm_hyp_handle_sysreg(vcpu, exit_code) || + kvm_handle_pvm_sysreg(vcpu, exit_code)); +} + +static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, + [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, +}; + +static const exit_handler_fn pvm_exit_handlers[] = { + [0 ... 
ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64, + [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, + [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, +}; + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +{ + if (unlikely(vcpu_is_protected(vcpu))) + return pvm_exit_handlers; + + return hyp_exit_handlers; +} + +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + + synchronize_vcpu_pstate(vcpu, exit_code); + + /* + * Some guests (e.g., protected VMs) are not be allowed to run in + * AArch32. The ARMv8 architecture does not give the hypervisor a + * mechanism to prevent a guest from dropping to AArch32 EL0 if + * implemented by the CPU. If the hypervisor spots a guest in such a + * state ensure it is handled, and don't trust the host to spot or fix + * it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + */ + if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { + /* + * As we have caught the guest red-handed, decide that it isn't + * fit for purpose anymore by making the vcpu invalid. The VMM + * can try and fix it by re-initializing the vcpu with + * KVM_ARM_VCPU_INIT, however, this is likely not possible for + * protected VMs. + */ + vcpu_clear_flag(vcpu, VCPU_INITIALIZED); + *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); + *exit_code |= ARM_EXCEPTION_IL; + } + + return __fixup_guest_exit(vcpu, exit_code, handlers); +} + +/* Switch to the guest for legacy non-VHE systems */ +int __kvm_vcpu_run(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *host_ctxt; + struct kvm_cpu_context *guest_ctxt; + struct kvm_s2_mmu *mmu; + bool pmu_switch_needed; + u64 exit_code; + + /* + * Having IRQs masked via PMR when entering the guest means the GIC + * will not signal the CPU of interrupts of lower priority, and the + * only way to get out will be via guest exceptions. + * Naturally, we want to avoid this. + */ + if (system_uses_irq_prio_masking()) { + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + pmr_sync(); + } + + host_ctxt = host_data_ptr(host_ctxt); + host_ctxt->__hyp_running_vcpu = vcpu; + guest_ctxt = &vcpu->arch.ctxt; + + pmu_switch_needed = __pmu_switch_to_guest(vcpu); + + __sysreg_save_state_nvhe(host_ctxt); + /* + * We must flush and disable the SPE buffer for nVHE, as + * the translation regime(EL1&0) is going to be loaded with + * that of the guest. And we must do this before we change the + * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and + * before we load guest Stage1. + */ + __debug_save_host_buffers_nvhe(vcpu); + + /* + * We're about to restore some new MMU state. Make sure + * ongoing page-table walks that have started before we + * trapped to EL2 have completed. This also synchronises the + * above disabling of BRBE, SPE and TRBE. + * + * See DDI0487I.a D8.1.5 "Out-of-context translation regimes", + * rule R_LFHQG and subsequent information statements. + */ + dsb(nsh); + + __kvm_adjust_pc(vcpu); + + /* + * We must restore the 32-bit state before the sysregs, thanks + * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72). 
+ * + * Also, and in order to be able to deal with erratum #1319537 (A57) + * and #1319367 (A72), we must ensure that all VM-related sysreg are + * restored before we enable S2 translation. + */ + __sysreg32_restore_state(vcpu); + __sysreg_restore_state_nvhe(guest_ctxt); + + mmu = kern_hyp_va(vcpu->arch.hw_mmu); + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + __activate_traps(vcpu); + + __hyp_vgic_restore_state(vcpu); + __timer_enable_traps(vcpu); + + __debug_switch_to_guest(vcpu); + + do { + /* Jump in the fire! */ + exit_code = __guest_enter(vcpu); + + /* And we're baaack! */ + } while (fixup_guest_exit(vcpu, &exit_code)); + + __sysreg_save_state_nvhe(guest_ctxt); + __sysreg32_save_state(vcpu); + __timer_disable_traps(vcpu); + __hyp_vgic_save_state(vcpu); + + /* + * Same thing as before the guest run: we're about to switch + * the MMU context, so let's make sure we don't have any + * ongoing EL1&0 translations. + */ + dsb(nsh); + + __deactivate_traps(vcpu); + __load_host_stage2(); + + __sysreg_restore_state_nvhe(host_ctxt); + + if (guest_owns_fp_regs()) + __fpsimd_save_fpexc32(vcpu); + + __debug_switch_to_host(vcpu); + /* + * This must come after restoring the host sysregs, since a non-VHE + * system may enable SPE here and make use of the TTBRs. + */ + __debug_restore_host_buffers_nvhe(vcpu); + + if (pmu_switch_needed) + __pmu_switch_to_host(vcpu); + + /* Returning to host will clear PSR.I, remask PMR if needed */ + if (system_uses_irq_prio_masking()) + gic_write_pmr(GIC_PRIO_IRQOFF); + + host_ctxt->__hyp_running_vcpu = NULL; + + return exit_code; +} + +asmlinkage void __noreturn hyp_panic(void) +{ + u64 spsr = read_sysreg_el2(SYS_SPSR); + u64 elr = read_sysreg_el2(SYS_ELR); + u64 par = read_sysreg_par(); + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = host_data_ptr(host_ctxt); + vcpu = host_ctxt->__hyp_running_vcpu; + + if (vcpu) { + __timer_disable_traps(vcpu); + __deactivate_traps(vcpu); + __load_host_stage2(); + __sysreg_restore_state_nvhe(host_ctxt); + } + + /* Prepare to dump kvm nvhe hyp stacktrace */ + kvm_nvhe_prepare_backtrace((unsigned long)__builtin_frame_address(0), + _THIS_IP_); + + __hyp_do_panic(host_ctxt, spsr, elr, par); + unreachable(); +} + +asmlinkage void __noreturn hyp_panic_bad_stack(void) +{ + hyp_panic(); +} + +asmlinkage void kvm_unexpected_el2_exception(void) +{ + __kvm_unexpected_el2_exception(); +} diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c new file mode 100644 index 000000000000..3108b5185c20 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -0,0 +1,576 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#include <linux/irqchip/arm-gic-v3.h> + +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> + +#include <hyp/adjust_pc.h> + +#include <nvhe/pkvm.h> + +#include "../../sys_regs.h" + +/* + * Copies of the host's CPU features registers holding sanitized values at hyp. 
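+ * They are used to compute the protected guests' view of the ID registers.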
+ */ +u64 id_aa64pfr0_el1_sys_val; +u64 id_aa64pfr1_el1_sys_val; +u64 id_aa64isar0_el1_sys_val; +u64 id_aa64isar1_el1_sys_val; +u64 id_aa64isar2_el1_sys_val; +u64 id_aa64mmfr0_el1_sys_val; +u64 id_aa64mmfr1_el1_sys_val; +u64 id_aa64mmfr2_el1_sys_val; +u64 id_aa64smfr0_el1_sys_val; + +struct pvm_ftr_bits { + bool sign; + u8 shift; + u8 width; + u8 max_val; + bool (*vm_supported)(const struct kvm *kvm); +}; + +#define __MAX_FEAT_FUNC(id, fld, max, func, sgn) \ + { \ + .sign = sgn, \ + .shift = id##_##fld##_SHIFT, \ + .width = id##_##fld##_WIDTH, \ + .max_val = id##_##fld##_##max, \ + .vm_supported = func, \ + } + +#define MAX_FEAT_FUNC(id, fld, max, func) \ + __MAX_FEAT_FUNC(id, fld, max, func, id##_##fld##_SIGNED) + +#define MAX_FEAT(id, fld, max) \ + MAX_FEAT_FUNC(id, fld, max, NULL) + +#define MAX_FEAT_ENUM(id, fld, max) \ + __MAX_FEAT_FUNC(id, fld, max, NULL, false) + +#define FEAT_END { .width = 0, } + +static bool vm_has_ptrauth(const struct kvm *kvm) +{ + if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) + return false; + + return (cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || + cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC); +} + +static bool vm_has_sve(const struct kvm *kvm) +{ + return system_supports_sve() && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_SVE); +} + +/* + * Definitions for features to be allowed or restricted for protected guests. + * + * Each field in the masks represents the highest supported value for the + * feature. If a feature field is not present, it is not supported. Moreover, + * these are used to generate the guest's view of the feature registers. + * + * The approach for protected VMs is to at least support features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ + +static const struct pvm_ftr_bits pvmid_aa64pfr0[] = { + MAX_FEAT(ID_AA64PFR0_EL1, EL0, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL1, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL3, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, FP, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, AdvSIMD, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, GIC, IMP), + MAX_FEAT_FUNC(ID_AA64PFR0_EL1, SVE, IMP, vm_has_sve), + MAX_FEAT(ID_AA64PFR0_EL1, RAS, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, DIT, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV3, IMP), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64pfr1[] = { + MAX_FEAT(ID_AA64PFR1_EL1, BT, IMP), + MAX_FEAT(ID_AA64PFR1_EL1, SSBS, SSBS2), + MAX_FEAT_ENUM(ID_AA64PFR1_EL1, MTE_frac, NI), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = { + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40), + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGEND, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, SNSMEM, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGENDEL0, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, EXS, IMP), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64mmfr1[] = { + MAX_FEAT(ID_AA64MMFR1_EL1, HAFDBS, DBM), + MAX_FEAT_ENUM(ID_AA64MMFR1_EL1, VMIDBits, 16), + MAX_FEAT(ID_AA64MMFR1_EL1, HPDS, HPDS2), + MAX_FEAT(ID_AA64MMFR1_EL1, PAN, PAN3), + MAX_FEAT(ID_AA64MMFR1_EL1, SpecSEI, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, ETS, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, CMOW, IMP), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = { + 
MAX_FEAT(ID_AA64MMFR2_EL1, CnP, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP),
+ MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18),
+ MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP),
+ MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2),
+ MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP),
+ FEAT_END
+};
+
+static const struct pvm_ftr_bits pvmid_aa64isar1[] = {
+ MAX_FEAT(ID_AA64ISAR1_EL1, DPB, DPB2),
+ MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, APA, PAuth, vm_has_ptrauth),
+ MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, API, PAuth, vm_has_ptrauth),
+ MAX_FEAT(ID_AA64ISAR1_EL1, JSCVT, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, FCMA, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, LRCPC, LRCPC3),
+ MAX_FEAT(ID_AA64ISAR1_EL1, GPA, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, GPI, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, FRINTTS, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, SB, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX),
+ MAX_FEAT(ID_AA64ISAR1_EL1, BF16, EBF16),
+ MAX_FEAT(ID_AA64ISAR1_EL1, DGH, IMP),
+ MAX_FEAT(ID_AA64ISAR1_EL1, I8MM, IMP),
+ FEAT_END
+};
+
+static const struct pvm_ftr_bits pvmid_aa64isar2[] = {
+ MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, GPA3, IMP, vm_has_ptrauth),
+ MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, APA3, PAuth, vm_has_ptrauth),
+ MAX_FEAT(ID_AA64ISAR2_EL1, ATS1A, IMP),
+ FEAT_END
+};
+
+/*
+ * None of the features in ID_AA64DFR0_EL1 nor ID_AA64MMFR4_EL1 are supported.
+ * However, both have Not-Implemented values that are non-zero. Define them
+ * so they can be used when getting the value of these registers.
+ */
+#define ID_AA64DFR0_EL1_NONZERO_NI \
+( \
+ SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI) | \
+ SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, MTPMU, NI) \
+)
+
+#define ID_AA64MMFR4_EL1_NONZERO_NI \
+ SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI)
+
+/*
+ * Returns the value of the feature registers based on the system register
+ * value, the vcpu support for the relevant features, and the additional
+ * restrictions for protected VMs.
+ */
+static u64 get_restricted_features(const struct kvm_vcpu *vcpu,
+ u64 sys_reg_val,
+ const struct pvm_ftr_bits restrictions[])
+{
+ u64 val = 0UL;
+ int i;
+
+ for (i = 0; restrictions[i].width != 0; i++) {
+ bool (*vm_supported)(const struct kvm *) = restrictions[i].vm_supported;
+ bool sign = restrictions[i].sign;
+ int shift = restrictions[i].shift;
+ int width = restrictions[i].width;
+ u64 min_signed = (1UL << width) - 1UL;
+ u64 sign_bit = 1UL << (width - 1);
+ u64 mask = GENMASK_ULL(width + shift - 1, shift);
+ u64 sys_val = (sys_reg_val & mask) >> shift;
+ u64 pvm_max = restrictions[i].max_val;
+
+ if (vm_supported && !vm_supported(vcpu->kvm))
+ val |= (sign ?
min_signed : 0) << shift; + else if (sign && (sys_val >= sign_bit || pvm_max >= sign_bit)) + val |= max(sys_val, pvm_max) << shift; + else + val |= min(sys_val, pvm_max) << shift; + } + + return val; +} + +static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) +{ + switch (id) { + case SYS_ID_AA64PFR0_EL1: + return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0); + case SYS_ID_AA64PFR1_EL1: + return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1); + case SYS_ID_AA64ISAR0_EL1: + return id_aa64isar0_el1_sys_val; + case SYS_ID_AA64ISAR1_EL1: + return get_restricted_features(vcpu, id_aa64isar1_el1_sys_val, pvmid_aa64isar1); + case SYS_ID_AA64ISAR2_EL1: + return get_restricted_features(vcpu, id_aa64isar2_el1_sys_val, pvmid_aa64isar2); + case SYS_ID_AA64MMFR0_EL1: + return get_restricted_features(vcpu, id_aa64mmfr0_el1_sys_val, pvmid_aa64mmfr0); + case SYS_ID_AA64MMFR1_EL1: + return get_restricted_features(vcpu, id_aa64mmfr1_el1_sys_val, pvmid_aa64mmfr1); + case SYS_ID_AA64MMFR2_EL1: + return get_restricted_features(vcpu, id_aa64mmfr2_el1_sys_val, pvmid_aa64mmfr2); + case SYS_ID_AA64DFR0_EL1: + return ID_AA64DFR0_EL1_NONZERO_NI; + case SYS_ID_AA64MMFR4_EL1: + return ID_AA64MMFR4_EL1_NONZERO_NI; + default: + /* Unhandled ID register, RAZ */ + return 0; + } +} + +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR)); + + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); +} + +static u64 read_id_reg(const struct kvm_vcpu *vcpu, + struct sys_reg_desc const *r) +{ + struct kvm *kvm = vcpu->kvm; + u32 reg = reg_to_encoding(r); + + if (WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))) + return 0; + + if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7)) + return kvm->arch.id_regs[IDREG_IDX(reg)]; + + return 0; +} + +/* Handler to RAZ/WI sysregs */ +static bool pvm_access_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!p->is_write) + p->regval = 0; + + return true; +} + +/* + * Accessor for AArch32 feature id registers. + * + * The value of these registers is "unknown" according to the spec if AArch32 + * isn't supported. + */ +static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + return pvm_access_raz_wi(vcpu, p, r); +} + +/* + * Accessor for AArch64 feature id registers. + * + * If access is allowed, set the regval to the protected VM's view of the + * register and return true. + * Otherwise, inject an undefined exception and return false. 
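+ * Writes are never allowed and always result in an UNDEF being injected
+ * into the guest.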
+ */ +static bool pvm_access_id_aarch64(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + p->regval = read_id_reg(vcpu, r); + return true; +} + +static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* pVMs only support GICv3. 'nuf said. */ + if (!p->is_write) + p->regval = ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB | ICC_SRE_EL1_SRE; + + return true; +} + +/* Mark the specified system register as an AArch32 feature id register. */ +#define AARCH32(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch32 } + +/* Mark the specified system register as an AArch64 feature id register. */ +#define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 } + +/* + * sys_reg_desc initialiser for architecturally unallocated cpufeature ID + * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 + * (1 <= crm < 8, 0 <= Op2 < 8). + */ +#define ID_UNALLOCATED(crm, op2) { \ + Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ + .access = pvm_access_id_aarch64, \ +} + +/* Mark the specified system register as Read-As-Zero/Write-Ignored */ +#define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi } + +/* Mark the specified system register as not being handled in hyp. */ +#define HOST_HANDLED(REG) { SYS_DESC(REG), .access = NULL } + +/* + * Architected system registers. + * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 + * + * NOTE: Anything not explicitly listed here is *restricted by default*, i.e., + * it will lead to injecting an exception into the guest. + */ +static const struct sys_reg_desc pvm_sys_reg_descs[] = { + /* Cache maintenance by set/way operations are restricted. */ + + /* Debug and Trace Registers are restricted. 
*/ + + /* Group 1 ID registers */ + HOST_HANDLED(SYS_REVIDR_EL1), + + /* AArch64 mappings of the AArch32 ID registers */ + /* CRm=1 */ + AARCH32(SYS_ID_PFR0_EL1), + AARCH32(SYS_ID_PFR1_EL1), + AARCH32(SYS_ID_DFR0_EL1), + AARCH32(SYS_ID_AFR0_EL1), + AARCH32(SYS_ID_MMFR0_EL1), + AARCH32(SYS_ID_MMFR1_EL1), + AARCH32(SYS_ID_MMFR2_EL1), + AARCH32(SYS_ID_MMFR3_EL1), + + /* CRm=2 */ + AARCH32(SYS_ID_ISAR0_EL1), + AARCH32(SYS_ID_ISAR1_EL1), + AARCH32(SYS_ID_ISAR2_EL1), + AARCH32(SYS_ID_ISAR3_EL1), + AARCH32(SYS_ID_ISAR4_EL1), + AARCH32(SYS_ID_ISAR5_EL1), + AARCH32(SYS_ID_MMFR4_EL1), + AARCH32(SYS_ID_ISAR6_EL1), + + /* CRm=3 */ + AARCH32(SYS_MVFR0_EL1), + AARCH32(SYS_MVFR1_EL1), + AARCH32(SYS_MVFR2_EL1), + ID_UNALLOCATED(3,3), + AARCH32(SYS_ID_PFR2_EL1), + AARCH32(SYS_ID_DFR1_EL1), + AARCH32(SYS_ID_MMFR5_EL1), + ID_UNALLOCATED(3,7), + + /* AArch64 ID registers */ + /* CRm=4 */ + AARCH64(SYS_ID_AA64PFR0_EL1), + AARCH64(SYS_ID_AA64PFR1_EL1), + ID_UNALLOCATED(4,2), + ID_UNALLOCATED(4,3), + AARCH64(SYS_ID_AA64ZFR0_EL1), + ID_UNALLOCATED(4,5), + ID_UNALLOCATED(4,6), + ID_UNALLOCATED(4,7), + AARCH64(SYS_ID_AA64DFR0_EL1), + AARCH64(SYS_ID_AA64DFR1_EL1), + ID_UNALLOCATED(5,2), + ID_UNALLOCATED(5,3), + AARCH64(SYS_ID_AA64AFR0_EL1), + AARCH64(SYS_ID_AA64AFR1_EL1), + ID_UNALLOCATED(5,6), + ID_UNALLOCATED(5,7), + AARCH64(SYS_ID_AA64ISAR0_EL1), + AARCH64(SYS_ID_AA64ISAR1_EL1), + AARCH64(SYS_ID_AA64ISAR2_EL1), + ID_UNALLOCATED(6,3), + ID_UNALLOCATED(6,4), + ID_UNALLOCATED(6,5), + ID_UNALLOCATED(6,6), + ID_UNALLOCATED(6,7), + AARCH64(SYS_ID_AA64MMFR0_EL1), + AARCH64(SYS_ID_AA64MMFR1_EL1), + AARCH64(SYS_ID_AA64MMFR2_EL1), + ID_UNALLOCATED(7,3), + ID_UNALLOCATED(7,4), + ID_UNALLOCATED(7,5), + ID_UNALLOCATED(7,6), + ID_UNALLOCATED(7,7), + + /* Scalable Vector Registers are restricted. */ + + HOST_HANDLED(SYS_ICC_PMR_EL1), + + RAZ_WI(SYS_ERRIDR_EL1), + RAZ_WI(SYS_ERRSELR_EL1), + RAZ_WI(SYS_ERXFR_EL1), + RAZ_WI(SYS_ERXCTLR_EL1), + RAZ_WI(SYS_ERXSTATUS_EL1), + RAZ_WI(SYS_ERXADDR_EL1), + RAZ_WI(SYS_ERXMISC0_EL1), + RAZ_WI(SYS_ERXMISC1_EL1), + + /* Performance Monitoring Registers are restricted. */ + + /* Limited Ordering Regions Registers are restricted. */ + + HOST_HANDLED(SYS_ICC_DIR_EL1), + HOST_HANDLED(SYS_ICC_RPR_EL1), + HOST_HANDLED(SYS_ICC_SGI1R_EL1), + HOST_HANDLED(SYS_ICC_ASGI1R_EL1), + HOST_HANDLED(SYS_ICC_SGI0R_EL1), + HOST_HANDLED(SYS_ICC_CTLR_EL1), + { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, + + HOST_HANDLED(SYS_CCSIDR_EL1), + HOST_HANDLED(SYS_CLIDR_EL1), + HOST_HANDLED(SYS_AIDR_EL1), + HOST_HANDLED(SYS_CSSELR_EL1), + HOST_HANDLED(SYS_CTR_EL0), + + /* Performance Monitoring Registers are restricted. */ + + /* Activity Monitoring Registers are restricted. */ + + HOST_HANDLED(SYS_CNTP_TVAL_EL0), + HOST_HANDLED(SYS_CNTP_CTL_EL0), + HOST_HANDLED(SYS_CNTP_CVAL_EL0), + + /* Performance Monitoring Registers are restricted. */ +}; + +/* + * Initializes feature registers for protected vms. + */ +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_arch *ka = &kvm->arch; + u32 r; + + hyp_assert_lock_held(&vm_table_lock); + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + /* + * Initialize only AArch64 id registers since AArch32 isn't supported + * for protected VMs. 
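+ * The loop below walks every encoding from sys_reg(3, 0, 0, 4, 0), i.e. + * ID_AA64PFR0_EL1, up to sys_reg(3, 0, 0, 7, 7), deriving each value via + * pvm_calc_id_reg().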
+ */ + for (r = sys_reg(3, 0, 0, 4, 0); r <= sys_reg(3, 0, 0, 7, 7); r += sys_reg(0, 0, 0, 0, 1)) + ka->id_regs[IDREG_IDX(r)] = pvm_calc_id_reg(vcpu, r); + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); +} + +/* + * Checks that the sysreg table is unique and in-order. + * + * Returns 0 if the table is consistent, or 1 otherwise. + */ +int kvm_check_pvm_sysreg_table(void) +{ + unsigned int i; + + for (i = 1; i < ARRAY_SIZE(pvm_sys_reg_descs); i++) { + if (cmp_sys_reg(&pvm_sys_reg_descs[i-1], &pvm_sys_reg_descs[i]) >= 0) + return 1; + } + + return 0; +} + +/* + * Handler for protected VM MSR, MRS or System instruction execution. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't, to be handled by the host. + */ +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + const struct sys_reg_desc *r; + struct sys_reg_params params; + unsigned long esr = kvm_vcpu_get_esr(vcpu); + int Rt = kvm_vcpu_sys_get_rt(vcpu); + + params = esr_sys64_to_params(esr); + params.regval = vcpu_get_reg(vcpu, Rt); + + r = find_reg(¶ms, pvm_sys_reg_descs, ARRAY_SIZE(pvm_sys_reg_descs)); + + /* Undefined (RESTRICTED). */ + if (r == NULL) { + inject_undef64(vcpu); + return true; + } + + /* Handled by the host (HOST_HANDLED) */ + if (r->access == NULL) + return false; + + /* Handled by hyp: skip instruction if instructed to do so. */ + if (r->access(vcpu, ¶ms, r)) + __kvm_skip_instr(vcpu); + + if (!params.is_write) + vcpu_set_reg(vcpu, Rt, params.regval); + + return true; +} + +/* + * Handler for protected VM restricted exceptions. + * + * Inject an undefined exception into the guest and return true to indicate that + * the hypervisor has handled the exit, and control should go back to the guest. + */ +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + inject_undef64(vcpu); + return true; +} diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c new file mode 100644 index 000000000000..3cc613cce5f5 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <hyp/sysreg-sr.h> + +#include <linux/compiler.h> +#include <linux/kvm_host.h> + +#include <asm/kprobes.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> + +/* + * Non-VHE: Both host and guest must save everything. 
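+ * The helpers below therefore save and restore the EL1 state, the common + * and user state, and the EL2 return state around every world switch.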
+ */ + +void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt) +{ + __sysreg_save_el1_state(ctxt); + __sysreg_save_common_state(ctxt); + __sysreg_save_user_state(ctxt); + __sysreg_save_el2_return_state(ctxt); +} + +void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) +{ + u64 midr = ctxt_midr_el1(ctxt); + + __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1)); + __sysreg_restore_common_state(ctxt); + __sysreg_restore_user_state(ctxt); + __sysreg_restore_el2_return_state(ctxt); +} diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c new file mode 100644 index 000000000000..ff176f4ce7de --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <clocksource/arm_arch_timer.h> +#include <linux/compiler.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +void __kvm_timer_set_cntvoff(u64 cntvoff) +{ + write_sysreg(cntvoff, cntvoff_el2); +} + +/* + * Should only be called on non-VHE or hVHE setups. + * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). + */ +void __timer_disable_traps(struct kvm_vcpu *vcpu) +{ + u64 set, clr, shift = 0; + + if (has_hvhe()) + shift = 10; + + /* Allow physical timer/counter access for the host */ + set = (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; + clr = CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + + sysreg_clear_set(cnthctl_el2, clr, set); +} + +/* + * Should only be called on non-VHE or hVHE setups. + * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). + */ +void __timer_enable_traps(struct kvm_vcpu *vcpu) +{ + u64 clr = 0, set = 0; + + /* + * Disallow physical timer access for the guest + * Physical counter access is allowed if no offset is enforced + * or running protected (we don't offset anything in this case). + */ + clr = CNTHCTL_EL1PCEN; + if (is_protected_kvm_enabled() || + !kern_hyp_va(vcpu->kvm)->arch.timer_data.poffset) + set |= CNTHCTL_EL1PCTEN; + else + clr |= CNTHCTL_EL1PCTEN; + + if (has_hvhe()) { + clr <<= 10; + set <<= 10; + } + + /* + * Trap the virtual counter/timer if we have a broken cntvoff + * implementation. + */ + if (has_broken_cntvoff()) + set |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + + sysreg_clear_set(cnthctl_el2, clr, set); +} diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c new file mode 100644 index 000000000000..48da9ca9763f --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/tlbflush.h> + +#include <nvhe/mem_protect.h> + +struct tlb_inv_context { + struct kvm_s2_mmu *mmu; + u64 tcr; + u64 sctlr; +}; + +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt, + bool nsh) +{ + struct kvm_s2_mmu *host_s2_mmu = &host_mmu.arch.mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + cxt->mmu = NULL; + + /* + * We have two requirements: + * + * - ensure that the page table updates are visible to all + * CPUs, for which a dsb(DOMAIN-st) is what we need, DOMAIN + * being either ish or nsh, depending on the invalidation + * type. 
+ * + * - complete any speculative page table walk started before + * we trapped to EL2 so that we can mess with the MM + * registers out of context, for which dsb(nsh) is enough + * + * The composition of these two barriers is a dsb(DOMAIN), and + * the 'nsh' parameter tracks the distinction between + * Inner-Shareable and Non-Shareable, as specified by the + * callers. + */ + if (nsh) + dsb(nsh); + else + dsb(ish); + + /* + * If we're already in the desired context, then there's nothing to do. + */ + if (vcpu) { + /* + * We're in guest context. However, for this to work, this needs + * to be called from within __kvm_vcpu_run(), which ensures that + * __hyp_running_vcpu is set to the current guest vcpu. + */ + if (mmu == vcpu->arch.hw_mmu || WARN_ON(mmu != host_s2_mmu)) + return; + + cxt->mmu = vcpu->arch.hw_mmu; + } else { + /* We're in host context. */ + if (mmu == host_s2_mmu) + return; + + cxt->mmu = host_s2_mmu; + } + + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { + u64 val; + + /* + * For CPUs that are affected by ARM 1319367, we need to + * avoid a Stage-1 walk with the old VMID while we have + * the new VMID set in the VTTBR in order to invalidate TLBs. + * We're guaranteed that the host S1 MMU is enabled, so + * we can simply set the EPD bits to avoid any further + * TLB fill. For guests, we ensure that the S1 MMU is + * temporarily enabled in the next context. + */ + val = cxt->tcr = read_sysreg_el1(SYS_TCR); + val |= TCR_EPD1_MASK | TCR_EPD0_MASK; + write_sysreg_el1(val, SYS_TCR); + isb(); + + if (vcpu) { + val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR); + if (!(val & SCTLR_ELx_M)) { + val |= SCTLR_ELx_M; + write_sysreg_el1(val, SYS_SCTLR); + isb(); + } + } else { + /* The host S1 MMU is always enabled. */ + cxt->sctlr = SCTLR_ELx_M; + } + } + + /* + * __load_stage2() includes an ISB only when the AT + * workaround is applied. Take care of the opposite condition, + * ensuring that we always have an ISB, but not two ISBs back + * to back. + */ + if (vcpu) + __load_host_stage2(); + else + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); +} + +static void exit_vmid_context(struct tlb_inv_context *cxt) +{ + struct kvm_s2_mmu *mmu = cxt->mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + + if (!mmu) + return; + + if (vcpu) + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + else + __load_host_stage2(); + + /* Ensure write of the old VMID */ + isb(); + + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { + if (!(cxt->sctlr & SCTLR_ELx_M)) { + write_sysreg_el1(cxt->sctlr, SYS_SCTLR); + isb(); + } + + write_sysreg_el1(cxt->tcr, SYS_TCR); + } +} + +void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt, false); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1is, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. 
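+ * Hence the sequence below: dsb(ish) to complete the IPA invalidation, + * vmalle1is to drop the stale Stage-1 entries, then a final dsb(ish) and + * isb() before exit_vmid_context() switches back.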
+ */ + dsb(ish); + __tlbi(vmalle1is); + dsb(ish); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt, true); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, + phys_addr_t start, unsigned long pages) +{ + struct tlb_inv_context cxt; + unsigned long stride; + + /* + * Since the range of addresses may not be mapped at + * the same level, assume the worst case as PAGE_SIZE + */ + stride = PAGE_SIZE; + start = round_down(start, stride); + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt, false); + + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); + + dsb(ish); + __tlbi(vmalle1is); + dsb(ish); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) +{ + struct tlb_inv_context cxt; + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt, false); + + __tlbi(vmalls12e1is); + dsb(ish); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) +{ + struct tlb_inv_context cxt; + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt, false); + + __tlbi(vmalle1); + asm volatile("ic iallu"); + dsb(nsh); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_flush_vm_context(void) +{ + /* Same remark as in enter_vmid_context() */ + dsb(ish); + __tlbi(alle1is); + dsb(ish); +} diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c new file mode 100644 index 000000000000..947ac1a951a5 --- /dev/null +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -0,0 +1,1679 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stand-alone page-table allocator for hyp stage-1 and guest stage-2. + * No bombay mix was harmed in the writing of this file. 
+ * + * Copyright (C) 2020 Google LLC + * Author: Will Deacon <will@kernel.org> + */ + +#include <linux/bitfield.h> +#include <asm/kvm_pgtable.h> +#include <asm/stage2_pgtable.h> + +struct kvm_pgtable_walk_data { + struct kvm_pgtable_walker *walker; + + const u64 start; + u64 addr; + const u64 end; +}; + +static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) +{ + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); +} + +static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) +{ + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); +} + +static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) +{ + u64 granule = kvm_granule_size(ctx->level); + + if (!kvm_level_supports_block_mapping(ctx->level)) + return false; + + if (granule > (ctx->end - ctx->addr)) + return false; + + if (!IS_ALIGNED(phys, granule)) + return false; + + return IS_ALIGNED(ctx->addr, granule); +} + +static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level) +{ + u64 shift = kvm_granule_shift(level); + u64 mask = BIT(PAGE_SHIFT - 3) - 1; + + return (data->addr >> shift) & mask; +} + +static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) +{ + u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ + u64 mask = BIT(pgt->ia_bits) - 1; + + return (addr & mask) >> shift; +} + +static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level) +{ + struct kvm_pgtable pgt = { + .ia_bits = ia_bits, + .start_level = start_level, + }; + + return kvm_pgd_page_idx(&pgt, -1ULL) + 1; +} + +static bool kvm_pte_table(kvm_pte_t pte, s8 level) +{ + if (level == KVM_PGTABLE_LAST_LEVEL) + return false; + + if (!kvm_pte_valid(pte)) + return false; + + return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; +} + +static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) +{ + return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); +} + +static void kvm_clear_pte(kvm_pte_t *ptep) +{ + WRITE_ONCE(*ptep, 0); +} + +static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops) +{ + kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); + + pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); + pte |= KVM_PTE_VALID; + return pte; +} + +static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) +{ + kvm_pte_t pte = kvm_phys_to_pte(pa); + u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE : + KVM_PTE_TYPE_BLOCK; + + pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); + pte |= FIELD_PREP(KVM_PTE_TYPE, type); + pte |= KVM_PTE_VALID; + + return pte; +} + +static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) +{ + return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); +} + +static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, + const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct kvm_pgtable_walker *walker = data->walker; + + /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */ + WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held()); + return walker->cb(ctx, visit); +} + +static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, + int r) +{ + /* + * Visitor callbacks return EAGAIN when the conditions that led to a + * fault are no longer reflected in the page tables due to a race to + * update a PTE. In the context of a fault handler this is interpreted + * as a signal to retry guest execution. 
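+ * stage2_map_walker_try_leaf() and stage2_attr_walker() below use -EAGAIN + * in exactly this way.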
+ * + * Ignore the return code altogether for walkers outside a fault handler + * (e.g. write protecting a range of memory) and chug along with the + * page table walk. + */ + if (r == -EAGAIN) + return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT); + + return !r; +} + +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level); + +static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, + struct kvm_pgtable_mm_ops *mm_ops, + kvm_pteref_t pteref, s8 level) +{ + enum kvm_pgtable_walk_flags flags = data->walker->flags; + kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); + struct kvm_pgtable_visit_ctx ctx = { + .ptep = ptep, + .old = READ_ONCE(*ptep), + .arg = data->walker->arg, + .mm_ops = mm_ops, + .start = data->start, + .addr = data->addr, + .end = data->end, + .level = level, + .flags = flags, + }; + int ret = 0; + bool reload = false; + kvm_pteref_t childp; + bool table = kvm_pte_table(ctx.old, level); + + if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) { + ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE); + reload = true; + } + + if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) { + ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF); + reload = true; + } + + /* + * Reload the page table after invoking the walker callback for leaf + * entries or after pre-order traversal, to allow the walker to descend + * into a newly installed or replaced table. + */ + if (reload) { + ctx.old = READ_ONCE(*ptep); + table = kvm_pte_table(ctx.old, level); + } + + if (!kvm_pgtable_walk_continue(data->walker, ret)) + goto out; + + if (!table) { + data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); + data->addr += kvm_granule_size(level); + goto out; + } + + childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops); + ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1); + if (!kvm_pgtable_walk_continue(data->walker, ret)) + goto out; + + if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST) + ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST); + +out: + if (kvm_pgtable_walk_continue(data->walker, ret)) + return 0; + + return ret; +} + +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level) +{ + u32 idx; + int ret = 0; + + if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL || + level > KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { + kvm_pteref_t pteref = &pgtable[idx]; + + if (data->addr >= data->end) + break; + + ret = __kvm_pgtable_visit(data, mm_ops, pteref, level); + if (ret) + break; + } + + return ret; +} + +static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data) +{ + u32 idx; + int ret = 0; + u64 limit = BIT(pgt->ia_bits); + + if (data->addr > limit || data->end > limit) + return -ERANGE; + + if (!pgt->pgd) + return -EINVAL; + + for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) { + kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE]; + + ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level); + if (ret) + break; + } + + return ret; +} + +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_pgtable_walker *walker) +{ + struct kvm_pgtable_walk_data walk_data = { + .start = ALIGN_DOWN(addr, PAGE_SIZE), + .addr = ALIGN_DOWN(addr, PAGE_SIZE), + .end = PAGE_ALIGN(walk_data.addr + size), + 
.walker = walker, + }; + int r; + + r = kvm_pgtable_walk_begin(walker); + if (r) + return r; + + r = _kvm_pgtable_walk(pgt, &walk_data); + kvm_pgtable_walk_end(walker); + + return r; +} + +struct leaf_walk_data { + kvm_pte_t pte; + s8 level; +}; + +static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct leaf_walk_data *data = ctx->arg; + + data->pte = ctx->old; + data->level = ctx->level; + + return 0; +} + +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, + kvm_pte_t *ptep, s8 *level) +{ + struct leaf_walk_data data; + struct kvm_pgtable_walker walker = { + .cb = leaf_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &data, + }; + int ret; + + ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE), + PAGE_SIZE, &walker); + if (!ret) { + if (ptep) + *ptep = data.pte; + if (level) + *level = data.level; + } + + return ret; +} + +struct hyp_map_data { + const u64 phys; + kvm_pte_t attr; +}; + +static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) +{ + bool device = prot & KVM_PGTABLE_PROT_DEVICE; + u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; + kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); + u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; + u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW : + KVM_PTE_LEAF_ATTR_LO_S1_AP_RO; + + if (!(prot & KVM_PGTABLE_PROT_R)) + return -EINVAL; + + if (prot & KVM_PGTABLE_PROT_X) { + if (prot & KVM_PGTABLE_PROT_W) + return -EINVAL; + + if (device) + return -EINVAL; + + if (system_supports_bti_kernel()) + attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP; + } else { + attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; + } + + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); + if (!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; + *ptep = attr; + + return 0; +} + +enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + u32 ap; + + if (!kvm_pte_valid(pte)) + return prot; + + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) + prot |= KVM_PGTABLE_PROT_X; + + ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); + if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) + prot |= KVM_PGTABLE_PROT_R; + else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW) + prot |= KVM_PGTABLE_PROT_RW; + + return prot; +} + +static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, + struct hyp_map_data *data) +{ + u64 phys = data->phys + (ctx->addr - ctx->start); + kvm_pte_t new; + + if (!kvm_block_mapping_supported(ctx, phys)) + return false; + + new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); + if (ctx->old == new) + return true; + if (!kvm_pte_valid(ctx->old)) + ctx->mm_ops->get_page(ctx->ptep); + else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) + return false; + + smp_store_release(ctx->ptep, new); + return true; +} + +static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + kvm_pte_t *childp, new; + struct hyp_map_data *data = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + if (hyp_map_walker_try_leaf(ctx, data)) + return 0; + + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); + if (!childp) + return -ENOMEM; + + new = kvm_init_table_pte(childp, mm_ops); + mm_ops->get_page(ctx->ptep); + smp_store_release(ctx->ptep, new); + + return 0; +} + +int 
kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, + enum kvm_pgtable_prot prot) +{ + int ret; + struct hyp_map_data map_data = { + .phys = ALIGN_DOWN(phys, PAGE_SIZE), + }; + struct kvm_pgtable_walker walker = { + .cb = hyp_map_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &map_data, + }; + + ret = hyp_set_prot_attr(prot, &map_data.attr); + if (ret) + return ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + isb(); + return ret; +} + +static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + kvm_pte_t *childp = NULL; + u64 granule = kvm_granule_size(ctx->level); + u64 *unmapped = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + if (!kvm_pte_valid(ctx->old)) + return -EINVAL; + + if (kvm_pte_table(ctx->old, ctx->level)) { + childp = kvm_pte_follow(ctx->old, mm_ops); + + if (mm_ops->page_count(childp) != 1) + return 0; + + kvm_clear_pte(ctx->ptep); + dsb(ishst); + __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); + } else { + if (ctx->end - ctx->addr < granule) + return -EINVAL; + + kvm_clear_pte(ctx->ptep); + dsb(ishst); + __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + *unmapped += granule; + } + + dsb(ish); + isb(); + mm_ops->put_page(ctx->ptep); + + if (childp) + mm_ops->put_page(childp); + + return 0; +} + +u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + u64 unmapped = 0; + struct kvm_pgtable_walker walker = { + .cb = hyp_unmap_walker, + .arg = &unmapped, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + }; + + if (!pgt->mm_ops->page_count) + return 0; + + kvm_pgtable_walk(pgt, addr, size, &walker); + return unmapped; +} + +int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, + struct kvm_pgtable_mm_ops *mm_ops) +{ + s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 - + ARM64_HW_PGTABLE_LEVELS(va_bits); + + if (start_level < KVM_PGTABLE_FIRST_LEVEL || + start_level > KVM_PGTABLE_LAST_LEVEL) + return -EINVAL; + + pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); + if (!pgt->pgd) + return -ENOMEM; + + pgt->ia_bits = va_bits; + pgt->start_level = start_level; + pgt->mm_ops = mm_ops; + pgt->mmu = NULL; + pgt->force_pte_cb = NULL; + + return 0; +} + +static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + if (!kvm_pte_valid(ctx->old)) + return 0; + + mm_ops->put_page(ctx->ptep); + + if (kvm_pte_table(ctx->old, ctx->level)) + mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); + + return 0; +} + +void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) +{ + struct kvm_pgtable_walker walker = { + .cb = hyp_free_walker, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + }; + + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd)); + pgt->pgd = NULL; +} + +struct stage2_map_data { + const u64 phys; + kvm_pte_t attr; + u8 owner_id; + + kvm_pte_t *anchor; + kvm_pte_t *childp; + + struct kvm_s2_mmu *mmu; + void *memcache; + + /* Force mappings to page granularity */ + bool force_pte; + + /* Walk should update owner_id only */ + bool annotation; +}; + +u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) +{ + u64 vtcr = VTCR_EL2_FLAGS; + s8 lvls; + + vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; + vtcr |= VTCR_EL2_T0SZ(phys_shift); + /* + * Use a minimum 2 level page table to prevent splitting + 
* host PMD huge pages at stage2. + */ + lvls = stage2_pgtable_levels(phys_shift); + if (lvls < 2) + lvls = 2; + + /* + * When LPA2 is enabled, the HW supports an extra level of translation + * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2 + * to as an addition to SL0 to enable encoding this extra start level. + * However, since we always use concatenated pages for the first level + * lookup, we will never need this extra level and therefore do not need + * to touch SL2. + */ + vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); + +#ifdef CONFIG_ARM64_HW_AFDBM + /* + * Enable the Hardware Access Flag management, unconditionally + * on all CPUs. In systems that have asymmetric support for the feature + * this allows KVM to leverage hardware support on the subset of cores + * that implement the feature. + * + * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by + * hardware) on implementations that do not advertise support for the + * feature. As such, setting HA unconditionally is safe, unless you + * happen to be running on a design that has unadvertised support for + * HAFDBS. Here be dragons. + */ + if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + vtcr |= VTCR_EL2_HA; +#endif /* CONFIG_ARM64_HW_AFDBM */ + + if (kvm_lpa2_is_enabled()) + vtcr |= VTCR_EL2_DS; + + /* Set the vmid bits */ + vtcr |= (get_vmid_bits(mmfr1) == 16) ? + VTCR_EL2_VS_16BIT : + VTCR_EL2_VS_8BIT; + + return vtcr; +} + +static bool stage2_has_fwb(struct kvm_pgtable *pgt) +{ + if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + return false; + + return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); +} + +void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, + phys_addr_t addr, size_t size) +{ + unsigned long pages, inval_pages; + + if (!system_supports_tlb_range()) { + kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); + return; + } + + pages = size >> PAGE_SHIFT; + while (pages > 0) { + inval_pages = min(pages, MAX_TLBI_RANGE_PAGES); + kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages); + + addr += inval_pages << PAGE_SHIFT; + pages -= inval_pages; + } +} + +#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) + +static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr) +{ + bool px, ux; + u8 xn; + + px = prot & KVM_PGTABLE_PROT_PX; + ux = prot & KVM_PGTABLE_PROT_UX; + + if (!cpus_have_final_cap(ARM64_HAS_XNX) && px != ux) + return -EINVAL; + + if (px && ux) + xn = 0b00; + else if (!px && ux) + xn = 0b01; + else if (!px && !ux) + xn = 0b10; + else + xn = 0b11; + + *attr &= ~KVM_PTE_LEAF_ATTR_HI_S2_XN; + *attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, xn); + return 0; +} + +static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, + kvm_pte_t *ptep) +{ + kvm_pte_t attr; + u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + int r; + + switch (prot & (KVM_PGTABLE_PROT_DEVICE | + KVM_PGTABLE_PROT_NORMAL_NC)) { + case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC: + return -EINVAL; + case KVM_PGTABLE_PROT_DEVICE: + if (prot & KVM_PGTABLE_PROT_X) + return -EINVAL; + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); + break; + case KVM_PGTABLE_PROT_NORMAL_NC: + if (prot & KVM_PGTABLE_PROT_X) + return -EINVAL; + attr = KVM_S2_MEMATTR(pgt, NORMAL_NC); + break; + default: + attr = KVM_S2_MEMATTR(pgt, NORMAL); + } + + r = stage2_set_xn_attr(prot, &attr); + if (r) + return r; + + if (prot & KVM_PGTABLE_PROT_R) + attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + + if (prot & KVM_PGTABLE_PROT_W) + attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + + if 
(!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + + attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; + *ptep = attr; + + return 0; +} + +enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + + if (!kvm_pte_valid(pte)) + return prot; + + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R) + prot |= KVM_PGTABLE_PROT_R; + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) + prot |= KVM_PGTABLE_PROT_W; + + switch (FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte)) { + case 0b00: + prot |= KVM_PGTABLE_PROT_PX | KVM_PGTABLE_PROT_UX; + break; + case 0b01: + prot |= KVM_PGTABLE_PROT_UX; + break; + case 0b11: + prot |= KVM_PGTABLE_PROT_PX; + break; + default: + break; + } + + return prot; +} + +static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) +{ + if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) + return true; + + return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)); +} + +static bool stage2_pte_is_counted(kvm_pte_t pte) +{ + /* + * The refcount tracks valid entries as well as invalid entries if they + * encode ownership of a page to another entity than the page-table + * owner, whose id is 0. + */ + return !!pte; +} + +static bool stage2_pte_is_locked(kvm_pte_t pte) +{ + return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED); +} + +static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) +{ + if (!kvm_pgtable_walk_shared(ctx)) { + WRITE_ONCE(*ctx->ptep, new); + return true; + } + + return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old; +} + +/** + * stage2_try_break_pte() - Invalidates a pte according to the + * 'break-before-make' requirements of the + * architecture. + * + * @ctx: context of the visited pte. + * @mmu: stage-2 mmu + * + * Returns: true if the pte was successfully broken. + * + * If the removed pte was valid, performs the necessary serialization and TLB + * invalidation for the old value. For counted ptes, drops the reference count + * on the containing table page. + */ +static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, + struct kvm_s2_mmu *mmu) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + if (stage2_pte_is_locked(ctx->old)) { + /* + * Should never occur if this walker has exclusive access to the + * page tables. + */ + WARN_ON(!kvm_pgtable_walk_shared(ctx)); + return false; + } + + if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) + return false; + + if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { + /* + * Perform the appropriate TLB invalidation based on the + * evicted pte value (if any). 
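+ * A table entry needs a range invalidation covering everything it could + * have mapped, whereas a valid leaf only needs a single IPA-scoped + * invalidation; see the branches below.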
+ */ + if (kvm_pte_table(ctx->old, ctx->level)) { + u64 size = kvm_granule_size(ctx->level); + u64 addr = ALIGN_DOWN(ctx->addr, size); + + kvm_tlb_flush_vmid_range(mmu, addr, size); + } else if (kvm_pte_valid(ctx->old)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, + ctx->addr, ctx->level); + } + } + + if (stage2_pte_is_counted(ctx->old)) + mm_ops->put_page(ctx->ptep); + + return true; +} + +static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + WARN_ON(!stage2_pte_is_locked(*ctx->ptep)); + + if (stage2_pte_is_counted(new)) + mm_ops->get_page(ctx->ptep); + + smp_store_release(ctx->ptep, new); +} + +static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt) +{ + /* + * If FEAT_TLBIRANGE is implemented, defer the individual + * TLB invalidations until the entire walk is finished, and + * then use the range-based TLBI instructions to do the + * invalidations. Condition deferred TLB invalidation on the + * system supporting FWB as the optimization is entirely + * pointless when the unmap walker needs to perform CMOs. + */ + return system_supports_tlb_range() && stage2_has_fwb(pgt); +} + +static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, + struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) +{ + struct kvm_pgtable *pgt = ctx->arg; + + /* + * Clear the existing PTE, and perform break-before-make if it was + * valid. Depending on the system support, defer the TLB maintenance + * for the same until the entire unmap walk is completed. + */ + if (kvm_pte_valid(ctx->old)) { + kvm_clear_pte(ctx->ptep); + + if (kvm_pte_table(ctx->old, ctx->level)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + TLBI_TTL_UNKNOWN); + } else if (!stage2_unmap_defer_tlb_flush(pgt)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + ctx->level); + } + } + + mm_ops->put_page(ctx->ptep); +} + +static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) +{ + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; + return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL); +} + +static bool stage2_pte_executable(kvm_pte_t pte) +{ + return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); +} + +static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, + const struct stage2_map_data *data) +{ + u64 phys = data->phys; + + /* Work out the correct PA based on how far the walk has gotten */ + return phys + (ctx->addr - ctx->start); +} + +static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, + struct stage2_map_data *data) +{ + u64 phys = stage2_map_walker_phys_addr(ctx, data); + + if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) + return false; + + if (data->annotation) + return true; + + return kvm_block_mapping_supported(ctx, phys); +} + +static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, + struct stage2_map_data *data) +{ + kvm_pte_t new; + u64 phys = stage2_map_walker_phys_addr(ctx, data); + u64 granule = kvm_granule_size(ctx->level); + struct kvm_pgtable *pgt = data->mmu->pgt; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + if (!stage2_leaf_mapping_allowed(ctx, data)) + return -E2BIG; + + if (!data->annotation) + new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); + else + new = kvm_init_invalid_leaf_owner(data->owner_id); + + /* + * Skip updating the PTE if we are trying to recreate the exact + * same mapping or only change the access permissions. 
Instead, + * the vCPU will exit one more time from guest if still needed + * and then go through the path of relaxing permissions. + */ + if (!stage2_pte_needs_update(ctx->old, new)) + return -EAGAIN; + + /* If we're only changing software bits, then store them and go! */ + if (!kvm_pgtable_walk_shared(ctx) && + !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) { + bool old_is_counted = stage2_pte_is_counted(ctx->old); + + if (old_is_counted != stage2_pte_is_counted(new)) { + if (old_is_counted) + mm_ops->put_page(ctx->ptep); + else + mm_ops->get_page(ctx->ptep); + } + WARN_ON_ONCE(!stage2_try_set_pte(ctx, new)); + return 0; + } + + if (!stage2_try_break_pte(ctx, data->mmu)) + return -EAGAIN; + + /* Perform CMOs before installation of the guest stage-2 PTE */ + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc && + stage2_pte_cacheable(pgt, new)) + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), + granule); + + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou && + stage2_pte_executable(new)) + mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); + + stage2_make_pte(ctx, new); + + return 0; +} + +static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, + struct stage2_map_data *data) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); + int ret; + + if (!stage2_leaf_mapping_allowed(ctx, data)) + return 0; + + ret = stage2_map_walker_try_leaf(ctx, data); + if (ret) + return ret; + + mm_ops->free_unlinked_table(childp, ctx->level); + return 0; +} + +static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, + struct stage2_map_data *data) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp, new; + int ret; + + ret = stage2_map_walker_try_leaf(ctx, data); + if (ret != -E2BIG) + return ret; + + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + if (!data->memcache) + return -ENOMEM; + + childp = mm_ops->zalloc_page(data->memcache); + if (!childp) + return -ENOMEM; + + if (!stage2_try_break_pte(ctx, data->mmu)) { + mm_ops->put_page(childp); + return -EAGAIN; + } + + /* + * If we've run into an existing block mapping then replace it with + * a table. Accesses beyond 'end' that fall within the new table + * will be mapped lazily. + */ + new = kvm_init_table_pte(childp, mm_ops); + stage2_make_pte(ctx, new); + + return 0; +} + +/* + * The TABLE_PRE callback runs for table entries on the way down, looking + * for table entries which we could conceivably replace with a block entry + * for this mapping. If it finds one it replaces the entry and calls + * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. + * + * Otherwise, the LEAF callback performs the mapping at the existing leaves + * instead. 
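+ * kvm_pgtable_stage2_map() and kvm_pgtable_stage2_set_owner() therefore + * register this walker with both KVM_PGTABLE_WALK_TABLE_PRE and + * KVM_PGTABLE_WALK_LEAF set.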
+ */ +static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct stage2_map_data *data = ctx->arg; + + switch (visit) { + case KVM_PGTABLE_WALK_TABLE_PRE: + return stage2_map_walk_table_pre(ctx, data); + case KVM_PGTABLE_WALK_LEAF: + return stage2_map_walk_leaf(ctx, data); + default: + return -EINVAL; + } +} + +int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + void *mc, enum kvm_pgtable_walk_flags flags) +{ + int ret; + struct stage2_map_data map_data = { + .phys = ALIGN_DOWN(phys, PAGE_SIZE), + .mmu = pgt->mmu, + .memcache = mc, + .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = flags | + KVM_PGTABLE_WALK_TABLE_PRE | + KVM_PGTABLE_WALK_LEAF, + .arg = &map_data, + }; + + if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys))) + return -EINVAL; + + ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); + if (ret) + return ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; +} + +int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, u8 owner_id) +{ + int ret; + struct stage2_map_data map_data = { + .mmu = pgt->mmu, + .memcache = mc, + .owner_id = owner_id, + .force_pte = true, + .annotation = true, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_TABLE_PRE | + KVM_PGTABLE_WALK_LEAF, + .arg = &map_data, + }; + + if (owner_id > KVM_MAX_OWNER_ID) + return -EINVAL; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + return ret; +} + +static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct kvm_pgtable *pgt = ctx->arg; + struct kvm_s2_mmu *mmu = pgt->mmu; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = NULL; + bool need_flush = false; + + if (!kvm_pte_valid(ctx->old)) { + if (stage2_pte_is_counted(ctx->old)) { + kvm_clear_pte(ctx->ptep); + mm_ops->put_page(ctx->ptep); + } + return 0; + } + + if (kvm_pte_table(ctx->old, ctx->level)) { + childp = kvm_pte_follow(ctx->old, mm_ops); + + if (mm_ops->page_count(childp) != 1) + return 0; + } else if (stage2_pte_cacheable(pgt, ctx->old)) { + need_flush = !stage2_has_fwb(pgt); + } + + /* + * This is similar to the map() path in that we unmap the entire + * block entry and rely on the remaining portions being faulted + * back lazily. 
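+ * stage2_unmap_put_pte() below clears the entry, performs the required TLB + * maintenance (unless it is deferred to a range-based flush at the end of + * the walk) and drops the refcount on the containing table page.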
+ */ + stage2_unmap_put_pte(ctx, mmu, mm_ops); + + if (need_flush && mm_ops->dcache_clean_inval_poc) + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), + kvm_granule_size(ctx->level)); + + if (childp) + mm_ops->put_page(childp); + + return 0; +} + +int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + int ret; + struct kvm_pgtable_walker walker = { + .cb = stage2_unmap_walker, + .arg = pgt, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + }; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + if (stage2_unmap_defer_tlb_flush(pgt)) + /* Perform the deferred TLB invalidations */ + kvm_tlb_flush_vmid_range(pgt->mmu, addr, size); + + return ret; +} + +struct stage2_attr_data { + kvm_pte_t attr_set; + kvm_pte_t attr_clr; + kvm_pte_t pte; + s8 level; +}; + +static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + kvm_pte_t pte = ctx->old; + struct stage2_attr_data *data = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + if (!kvm_pte_valid(ctx->old)) + return -EAGAIN; + + data->level = ctx->level; + data->pte = pte; + pte &= ~data->attr_clr; + pte |= data->attr_set; + + /* + * We may race with the CPU trying to set the access flag here, + * but worst-case the access flag update gets lost and will be + * set on the next access instead. + */ + if (data->pte != pte) { + /* + * Invalidate instruction cache before updating the guest + * stage-2 PTE if we are going to add executable permission. + */ + if (mm_ops->icache_inval_pou && + stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old)) + mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), + kvm_granule_size(ctx->level)); + + if (!stage2_try_set_pte(ctx, pte)) + return -EAGAIN; + } + + return 0; +} + +static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, + u64 size, kvm_pte_t attr_set, + kvm_pte_t attr_clr, kvm_pte_t *orig_pte, + s8 *level, enum kvm_pgtable_walk_flags flags) +{ + int ret; + kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; + struct stage2_attr_data data = { + .attr_set = attr_set & attr_mask, + .attr_clr = attr_clr & attr_mask, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_attr_walker, + .arg = &data, + .flags = flags | KVM_PGTABLE_WALK_LEAF, + }; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + if (ret) + return ret; + + if (orig_pte) + *orig_pte = data.pte; + + if (level) + *level = data.level; + return 0; +} + +int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + return stage2_update_leaf_attrs(pgt, addr, size, 0, + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, + NULL, NULL, 0); +} + +void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) +{ + int ret; + + ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, + NULL, NULL, flags); + if (!ret) + dsb(ishst); +} + +struct stage2_age_data { + bool mkold; + bool young; +}; + +static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; + struct stage2_age_data *data = ctx->arg; + + if (!kvm_pte_valid(ctx->old) || new == ctx->old) + return 0; + + data->young = true; + + /* + * stage2_age_walker() is always called while holding the MMU lock for + * write, so this will always succeed. 
Nonetheless, this deliberately + * follows the race detection pattern of the other stage-2 walkers in + * case the locking mechanics of the MMU notifiers is ever changed. + */ + if (data->mkold && !stage2_try_set_pte(ctx, new)) + return -EAGAIN; + + /* + * "But where's the TLBI?!", you scream. + * "Over in the core code", I sigh. + * + * See the '->clear_flush_young()' callback on the KVM mmu notifier. + */ + return 0; +} + +bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, + u64 size, bool mkold) +{ + struct stage2_age_data data = { + .mkold = mkold, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_age_walker, + .arg = &data, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); + return data.young; +} + +int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) +{ + kvm_pte_t xn = 0, set = 0, clr = 0; + s8 level; + int ret; + + if (prot & KVM_PTE_LEAF_ATTR_HI_SW) + return -EINVAL; + + if (prot & KVM_PGTABLE_PROT_R) + set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + + if (prot & KVM_PGTABLE_PROT_W) + set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + + ret = stage2_set_xn_attr(prot, &xn); + if (ret) + return ret; + + set |= xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; + clr |= ~xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; + + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); + if (!ret || ret == -EAGAIN) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); + return ret; +} + +static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct kvm_pgtable *pgt = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + + if (!stage2_pte_cacheable(pgt, ctx->old)) + return 0; + + if (mm_ops->dcache_clean_inval_poc) + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), + kvm_granule_size(ctx->level)); + return 0; +} + +int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_flush_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = pgt, + }; + + if (stage2_has_fwb(pgt)) + return 0; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + +kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, + u64 phys, s8 level, + enum kvm_pgtable_prot prot, + void *mc, bool force_pte) +{ + struct stage2_map_data map_data = { + .phys = phys, + .mmu = pgt->mmu, + .memcache = mc, + .force_pte = force_pte, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_SKIP_BBM_TLBI | + KVM_PGTABLE_WALK_SKIP_CMO, + .arg = &map_data, + }; + /* + * The input address (.addr) is irrelevant for walking an + * unlinked table. Construct an ambiguous IA range to map + * kvm_granule_size(level) worth of memory. 
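+ * In practice the walk below simply runs over [0, kvm_granule_size(level)) + * so that every slot of the new table is visited.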
+ */ + struct kvm_pgtable_walk_data data = { + .walker = &walker, + .addr = 0, + .end = kvm_granule_size(level), + }; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + kvm_pte_t *pgtable; + int ret; + + if (!IS_ALIGNED(phys, kvm_granule_size(level))) + return ERR_PTR(-EINVAL); + + ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); + if (ret) + return ERR_PTR(ret); + + pgtable = mm_ops->zalloc_page(mc); + if (!pgtable) + return ERR_PTR(-ENOMEM); + + ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, + level + 1); + if (ret) { + kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); + return ERR_PTR(ret); + } + + return pgtable; +} + +/* + * Get the number of page-tables needed to replace a block with a + * fully populated tree up to the PTE entries. Note that @level is + * interpreted as in "level @level entry". + */ +static int stage2_block_get_nr_page_tables(s8 level) +{ + switch (level) { + case 1: + return PTRS_PER_PTE + 1; + case 2: + return 1; + case 3: + return 0; + default: + WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || + level > KVM_PGTABLE_LAST_LEVEL); + return -EINVAL; + }; +} + +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + struct kvm_mmu_memory_cache *mc = ctx->arg; + struct kvm_s2_mmu *mmu; + kvm_pte_t pte = ctx->old, new, *childp; + enum kvm_pgtable_prot prot; + s8 level = ctx->level; + bool force_pte; + int nr_pages; + u64 phys; + + /* No huge-pages exist at the last level */ + if (level == KVM_PGTABLE_LAST_LEVEL) + return 0; + + /* We only split valid block mappings */ + if (!kvm_pte_valid(pte)) + return 0; + + nr_pages = stage2_block_get_nr_page_tables(level); + if (nr_pages < 0) + return nr_pages; + + if (mc->nobjs >= nr_pages) { + /* Build a tree mapped down to the PTE granularity. */ + force_pte = true; + } else { + /* + * Don't force PTEs, so create_unlinked() below does + * not populate the tree up to the PTE level. The + * consequence is that the call will require a single + * page of level 2 entries at level 1, or a single + * page of PTEs at level 2. If we are at level 1, the + * PTEs will be created recursively. + */ + force_pte = false; + nr_pages = 1; + } + + if (mc->nobjs < nr_pages) + return -ENOMEM; + + mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); + phys = kvm_pte_to_phys(pte); + prot = kvm_pgtable_stage2_pte_prot(pte); + + childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, + level, prot, mc, force_pte); + if (IS_ERR(childp)) + return PTR_ERR(childp); + + if (!stage2_try_break_pte(ctx, mmu)) { + kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); + return -EAGAIN; + } + + /* + * Note, the contents of the page table are guaranteed to be made + * visible before the new PTE is assigned because stage2_make_pte() + * writes the PTE using smp_store_release(). 
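+ * The old block mapping was already invalidated by stage2_try_break_pte() + * above, so installing the table entry here completes the break-before-make + * sequence.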
+ */ + new = kvm_init_table_pte(childp, mm_ops); + stage2_make_pte(ctx, new); + return 0; +} + +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_split_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = mc, + }; + int ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; +} + +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags, + kvm_pgtable_force_pte_cb_t force_pte_cb) +{ + size_t pgd_sz; + u64 vtcr = mmu->vtcr; + u32 ia_bits = VTCR_EL2_IPA(vtcr); + u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + + pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; + pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); + if (!pgt->pgd) + return -ENOMEM; + + pgt->ia_bits = ia_bits; + pgt->start_level = start_level; + pgt->mm_ops = mm_ops; + pgt->mmu = mmu; + pgt->flags = flags; + pgt->force_pte_cb = force_pte_cb; + + /* Ensure zeroed PGD pages are visible to the hardware walker */ + dsb(ishst); + return 0; +} + +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) +{ + u32 ia_bits = VTCR_EL2_IPA(vtcr); + u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + + return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; +} + +static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + mm_ops->put_page(ctx->ptep); + return 0; +} + +static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); + + if (mm_ops->page_count(childp) != 1) + return 0; + + /* + * Drop references and clear the now stale PTE to avoid rewalking the + * freed page table. 
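+ * The page_count() check above guarantees this is the last reference to + * the child table, so the put_page(childp) below drops the final reference + * and frees the page.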
+ */ + mm_ops->put_page(ctx->ptep); + mm_ops->put_page(childp); + kvm_clear_pte(ctx->ptep); + return 0; +} + +static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + if (!stage2_pte_is_counted(ctx->old)) + return 0; + + switch (visit) { + case KVM_PGTABLE_WALK_LEAF: + return stage2_free_leaf(ctx); + case KVM_PGTABLE_WALK_TABLE_POST: + return stage2_free_table_post(ctx); + default: + return -EINVAL; + } +} + +void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_free_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + }; + + WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); +} + +void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) +{ + size_t pgd_sz; + + pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; + + /* + * Since the pgtable is unlinked at this point, and not shared with + * other walkers, safely deference pgd with kvm_dereference_pteref_raw() + */ + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz); + pgt->pgd = NULL; +} + +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits)); + kvm_pgtable_stage2_destroy_pgd(pgt); +} + +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) +{ + kvm_pteref_t ptep = (kvm_pteref_t)pgtable; + struct kvm_pgtable_walker walker = { + .cb = stage2_free_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + }; + struct kvm_pgtable_walk_data data = { + .walker = &walker, + + /* + * At this point the IPA really doesn't matter, as the page + * table being traversed has already been removed from the stage + * 2. Set an appropriate range to cover the entire page table. + */ + .addr = 0, + .end = kvm_granule_size(level), + }; + + WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1)); + + WARN_ON(mm_ops->page_count(pgtable) != 1); + mm_ops->put_page(pgtable); +} diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c new file mode 100644 index 000000000000..5fd99763b54d --- /dev/null +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <hyp/adjust_pc.h> + +#include <linux/compiler.h> +#include <linux/irqchip/arm-gic.h> +#include <linux/kvm_host.h> +#include <linux/swab.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +static bool __is_be(struct kvm_vcpu *vcpu) +{ + if (vcpu_mode_is_32bit(vcpu)) + return !!(read_sysreg_el2(SYS_SPSR) & PSR_AA32_E_BIT); + + return !!(read_sysreg_el1(SYS_SCTLR) & SCTLR_ELx_EE); +} + +/* + * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the + * guest. 
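+ *
+ * The access is emulated against the hardware GICV frame mapped at EL2
+ * (kvm_vgic_global_state.vcpu_hyp_va), honouring the guest's endianness.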
+ * + * @vcpu: the offending vcpu + * + * Returns: + * 1: GICV access successfully performed + * 0: Not a GICV access + * -1: Illegal GICV access successfully performed + */ +int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = kern_hyp_va(vcpu->kvm); + struct vgic_dist *vgic = &kvm->arch.vgic; + phys_addr_t fault_ipa; + void __iomem *addr; + int rd; + + /* Build the full address */ + fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + + /* If not for GICV, move on */ + if (fault_ipa < vgic->vgic_cpu_base || + fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE)) + return 0; + + /* Reject anything but a 32bit access */ + if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) { + __kvm_skip_instr(vcpu); + return -1; + } + + /* Not aligned? Don't bother */ + if (fault_ipa & 3) { + __kvm_skip_instr(vcpu); + return -1; + } + + /* Handle deactivation as a normal exit */ + if ((fault_ipa - vgic->vgic_cpu_base) >= GIC_CPU_DEACTIVATE) + return 0; + + rd = kvm_vcpu_dabt_get_rd(vcpu); + addr = kvm_vgic_global_state.vcpu_hyp_va; + addr += fault_ipa - vgic->vgic_cpu_base; + + if (kvm_vcpu_dabt_iswrite(vcpu)) { + u32 data = vcpu_get_reg(vcpu, rd); + if (__is_be(vcpu)) { + /* guest pre-swabbed data, undo this for writel() */ + data = __kvm_swab32(data); + } + writel_relaxed(data, addr); + } else { + u32 data = readl_relaxed(addr); + if (__is_be(vcpu)) { + /* guest expects swabbed data */ + data = __kvm_swab32(data); + } + vcpu_set_reg(vcpu, rd, data); + } + + __kvm_skip_instr(vcpu); + + return 1; +} diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c new file mode 100644 index 000000000000..0b670a033fd8 --- /dev/null +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -0,0 +1,1308 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <hyp/adjust_pc.h> + +#include <linux/compiler.h> +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +#include "../../vgic/vgic.h" + +#define vtr_to_max_lr_idx(v) ((v) & 0xf) +#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) +#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) + +u64 __gic_v3_get_lr(unsigned int lr) +{ + switch (lr & 0xf) { + case 0: + return read_gicreg(ICH_LR0_EL2); + case 1: + return read_gicreg(ICH_LR1_EL2); + case 2: + return read_gicreg(ICH_LR2_EL2); + case 3: + return read_gicreg(ICH_LR3_EL2); + case 4: + return read_gicreg(ICH_LR4_EL2); + case 5: + return read_gicreg(ICH_LR5_EL2); + case 6: + return read_gicreg(ICH_LR6_EL2); + case 7: + return read_gicreg(ICH_LR7_EL2); + case 8: + return read_gicreg(ICH_LR8_EL2); + case 9: + return read_gicreg(ICH_LR9_EL2); + case 10: + return read_gicreg(ICH_LR10_EL2); + case 11: + return read_gicreg(ICH_LR11_EL2); + case 12: + return read_gicreg(ICH_LR12_EL2); + case 13: + return read_gicreg(ICH_LR13_EL2); + case 14: + return read_gicreg(ICH_LR14_EL2); + case 15: + return read_gicreg(ICH_LR15_EL2); + } + + unreachable(); +} + +void __gic_v3_set_lr(u64 val, int lr) +{ + switch (lr & 0xf) { + case 0: + write_gicreg(val, ICH_LR0_EL2); + break; + case 1: + write_gicreg(val, ICH_LR1_EL2); + break; + case 2: + write_gicreg(val, ICH_LR2_EL2); + break; + case 3: + write_gicreg(val, ICH_LR3_EL2); + break; + case 4: + write_gicreg(val, ICH_LR4_EL2); + break; + case 5: + write_gicreg(val, ICH_LR5_EL2); + break; + case 
6: + write_gicreg(val, ICH_LR6_EL2); + break; + case 7: + write_gicreg(val, ICH_LR7_EL2); + break; + case 8: + write_gicreg(val, ICH_LR8_EL2); + break; + case 9: + write_gicreg(val, ICH_LR9_EL2); + break; + case 10: + write_gicreg(val, ICH_LR10_EL2); + break; + case 11: + write_gicreg(val, ICH_LR11_EL2); + break; + case 12: + write_gicreg(val, ICH_LR12_EL2); + break; + case 13: + write_gicreg(val, ICH_LR13_EL2); + break; + case 14: + write_gicreg(val, ICH_LR14_EL2); + break; + case 15: + write_gicreg(val, ICH_LR15_EL2); + break; + } +} + +static void __vgic_v3_write_ap0rn(u32 val, int n) +{ + switch (n) { + case 0: + write_gicreg(val, ICH_AP0R0_EL2); + break; + case 1: + write_gicreg(val, ICH_AP0R1_EL2); + break; + case 2: + write_gicreg(val, ICH_AP0R2_EL2); + break; + case 3: + write_gicreg(val, ICH_AP0R3_EL2); + break; + } +} + +static void __vgic_v3_write_ap1rn(u32 val, int n) +{ + switch (n) { + case 0: + write_gicreg(val, ICH_AP1R0_EL2); + break; + case 1: + write_gicreg(val, ICH_AP1R1_EL2); + break; + case 2: + write_gicreg(val, ICH_AP1R2_EL2); + break; + case 3: + write_gicreg(val, ICH_AP1R3_EL2); + break; + } +} + +static u32 __vgic_v3_read_ap0rn(int n) +{ + u32 val; + + switch (n) { + case 0: + val = read_gicreg(ICH_AP0R0_EL2); + break; + case 1: + val = read_gicreg(ICH_AP0R1_EL2); + break; + case 2: + val = read_gicreg(ICH_AP0R2_EL2); + break; + case 3: + val = read_gicreg(ICH_AP0R3_EL2); + break; + default: + unreachable(); + } + + return val; +} + +static u32 __vgic_v3_read_ap1rn(int n) +{ + u32 val; + + switch (n) { + case 0: + val = read_gicreg(ICH_AP1R0_EL2); + break; + case 1: + val = read_gicreg(ICH_AP1R1_EL2); + break; + case 2: + val = read_gicreg(ICH_AP1R2_EL2); + break; + case 3: + val = read_gicreg(ICH_AP1R3_EL2); + break; + default: + unreachable(); + } + + return val; +} + +static u64 compute_ich_hcr(struct vgic_v3_cpu_if *cpu_if) +{ + return cpu_if->vgic_hcr | vgic_ich_hcr_trap_bits(); +} + +void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) +{ + u64 used_lrs = cpu_if->used_lrs; + + /* + * Make sure stores to the GIC via the memory mapped interface + * are now visible to the system register interface when reading the + * LRs, and when reading back the VMCR on non-VHE systems. + */ + if (used_lrs || !has_vhe()) { + if (!cpu_if->vgic_sre) { + dsb(sy); + isb(); + } + } + + if (used_lrs) { + int i; + u32 elrsr; + + elrsr = read_gicreg(ICH_ELRSR_EL2); + + for (i = 0; i < used_lrs; i++) { + if (elrsr & (1 << i)) + cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; + else + cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); + + __gic_v3_set_lr(0, i); + } + } + + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { + u64 val = read_gicreg(ICH_HCR_EL2); + cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; + cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; + } + + write_gicreg(0, ICH_HCR_EL2); + + /* + * Hack alert: On NV, this results in a trap so that the above write + * actually takes effect... No synchronisation is necessary, as we + * only care about the effects when this traps. + */ + read_gicreg(ICH_MISR_EL2); +} + +void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) +{ + u64 used_lrs = cpu_if->used_lrs; + int i; + + write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); + + for (i = 0; i < used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); + + /* + * Ensure that writes to the LRs, and on non-VHE systems ensure that + * the write to the VMCR in __vgic_v3_activate_traps(), will have + * reached the (re)distributors. 
This ensure the guest will read the + * correct values from the memory-mapped interface. + */ + if (used_lrs || !has_vhe()) { + if (!cpu_if->vgic_sre) { + isb(); + dsb(sy); + } + } +} + +void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) +{ + /* + * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a + * Group0 interrupt (as generated in GICv2 mode) to be + * delivered as a FIQ to the guest, with potentially fatal + * consequences. So we must make sure that ICC_SRE_EL1 has + * been actually programmed with the value we want before + * starting to mess with the rest of the GIC, and VMCR_EL2 in + * particular. This logic must be called before + * __vgic_v3_restore_state(). + * + * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is + * provisioned at all. In order to prevent illegal accesses to the + * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 + * so that the trap bits can take effect. Yes, we *loves* the GIC. + */ + if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) { + write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); + isb(); + } else if (!cpu_if->vgic_sre) { + write_gicreg(0, ICC_SRE_EL1); + isb(); + write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); + + + if (has_vhe()) { + /* + * Ensure that the write to the VMCR will have reached + * the (re)distributors. This ensure the guest will + * read the correct values from the memory-mapped + * interface. + */ + isb(); + dsb(sy); + } + } + + /* Only disable SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { + /* + * Prevent the guest from touching the ICC_SRE_EL1 system + * register. Note that this may not have any effect, as + * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. + */ + write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, + ICC_SRE_EL2); + } + + /* + * If we need to trap system registers, we must write ICH_HCR_EL2 + * anyway, even if no interrupts are being injected. Note that this + * also applies if we don't expect any system register access (no + * vgic at all). In any case, no need to provide MI configuration. + */ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) || + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) + write_gicreg(vgic_ich_hcr_trap_bits() | ICH_HCR_EL2_En, ICH_HCR_EL2); +} + +void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) +{ + u64 val; + + /* Only restore SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { + val = read_gicreg(ICC_SRE_EL2); + write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); + + if (!cpu_if->vgic_sre) { + /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ + isb(); + write_gicreg(1, ICC_SRE_EL1); + } + } + + /* + * If we were trapping system registers, we enabled the VGIC even if + * no interrupts were being injected, and we disable it again here. 
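+ * The condition below mirrors the one used in __vgic_v3_activate_traps().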
+ */ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) || + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) + write_gicreg(0, ICH_HCR_EL2); +} + +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + u64 val; + u32 nr_pre_bits; + + val = read_gicreg(ICH_VTR_EL2); + nr_pre_bits = vtr_to_nr_pre_bits(val); + + switch (nr_pre_bits) { + case 7: + cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3); + cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); + fallthrough; + case 6: + cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1); + fallthrough; + default: + cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0); + } + + switch (nr_pre_bits) { + case 7: + cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3); + cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2); + fallthrough; + case 6: + cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1); + fallthrough; + default: + cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0); + } +} + +static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + u64 val; + u32 nr_pre_bits; + + val = read_gicreg(ICH_VTR_EL2); + nr_pre_bits = vtr_to_nr_pre_bits(val); + + switch (nr_pre_bits) { + case 7: + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3); + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2); + fallthrough; + case 6: + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1); + fallthrough; + default: + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0); + } + + switch (nr_pre_bits) { + case 7: + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3); + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2); + fallthrough; + case 6: + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1); + fallthrough; + default: + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0); + } +} + +void __vgic_v3_init_lrs(void) +{ + int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); + int i; + + for (i = 0; i <= max_lr_idx; i++) + __gic_v3_set_lr(0, i); +} + +/* + * Return the GIC CPU configuration: + * - [31:0] ICH_VTR_EL2 + * - [62:32] RES0 + * - [63] MMIO (GICv2) capable + */ +u64 __vgic_v3_get_gic_config(void) +{ + u64 val, sre; + unsigned long flags = 0; + + /* + * In compat mode, we cannot access ICC_SRE_EL1 at any EL + * other than EL1 itself; just return the + * ICH_VTR_EL2. ICC_IDR0_EL1 is only implemented on a GICv5 + * system, so we first check if we have GICv5 support. + */ + if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return read_gicreg(ICH_VTR_EL2); + + sre = read_gicreg(ICC_SRE_EL1); + /* + * To check whether we have a MMIO-based (GICv2 compatible) + * CPU interface, we need to disable the system register + * view. + * + * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates + * that to be able to set ICC_SRE_EL1.SRE to 0, all the + * interrupt overrides must be set. You've got to love this. + * + * As we always run VHE with HCR_xMO set, no extra xMO + * manipulation is required in that case. + * + * To safely disable SRE, we have to prevent any interrupt + * from firing (which would be deadly). This only makes sense + * on VHE, as interrupts are already masked for nVHE as part + * of the exception entry to EL2. + */ + if (has_vhe()) { + flags = local_daif_save(); + } else { + sysreg_clear_set_hcr(0, HCR_AMO | HCR_FMO | HCR_IMO); + isb(); + } + + write_gicreg(0, ICC_SRE_EL1); + isb(); + + val = read_gicreg(ICC_SRE_EL1); + + write_gicreg(sre, ICC_SRE_EL1); + isb(); + + if (has_vhe()) { + local_daif_restore(flags); + } else { + sysreg_clear_set_hcr(HCR_AMO | HCR_FMO | HCR_IMO, 0); + isb(); + } + + val = (val & ICC_SRE_EL1_SRE) ? 
0 : (1ULL << 63); + val |= read_gicreg(ICH_VTR_EL2); + + return val; +} + +static void __vgic_v3_compat_mode_enable(void) +{ + if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return; + + sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, 0, ICH_VCTLR_EL2_V3); + /* Wait for V3 to become enabled */ + isb(); +} + +static u64 __vgic_v3_read_vmcr(void) +{ + return read_gicreg(ICH_VMCR_EL2); +} + +static void __vgic_v3_write_vmcr(u32 vmcr) +{ + write_gicreg(vmcr, ICH_VMCR_EL2); +} + +void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + __vgic_v3_compat_mode_enable(); + + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. + */ + if (cpu_if->vgic_sre) + __vgic_v3_write_vmcr(cpu_if->vgic_vmcr); + __vgic_v3_restore_aprs(cpu_if); +} + +static int __vgic_v3_bpr_min(void) +{ + /* See Pseudocode for VPriorityGroup */ + return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2)); +} + +static int __vgic_v3_get_group(struct kvm_vcpu *vcpu) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; + + return crm != 8; +} + +#define GICv3_IDLE_PRIORITY 0xff + +static int __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, u32 vmcr, + u64 *lr_val) +{ + unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; + u8 priority = GICv3_IDLE_PRIORITY; + int i, lr = -1; + + for (i = 0; i < used_lrs; i++) { + u64 val = __gic_v3_get_lr(i); + u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; + + /* Not pending in the state? */ + if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT) + continue; + + /* Group-0 interrupt, but Group-0 disabled? */ + if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK)) + continue; + + /* Group-1 interrupt, but Group-1 disabled? */ + if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK)) + continue; + + /* Not the highest priority? */ + if (lr_prio >= priority) + continue; + + /* This is a candidate */ + priority = lr_prio; + *lr_val = val; + lr = i; + } + + if (lr == -1) + *lr_val = ICC_IAR1_EL1_SPURIOUS; + + return lr; +} + +static int __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, int intid, + u64 *lr_val) +{ + unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; + int i; + + for (i = 0; i < used_lrs; i++) { + u64 val = __gic_v3_get_lr(i); + + if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid && + (val & ICH_LR_ACTIVE_BIT)) { + *lr_val = val; + return i; + } + } + + *lr_val = ICC_IAR1_EL1_SPURIOUS; + return -1; +} + +static int __vgic_v3_get_highest_active_priority(void) +{ + u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); + u32 hap = 0; + int i; + + for (i = 0; i < nr_apr_regs; i++) { + u32 val; + + /* + * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers + * contain the active priority levels for this VCPU + * for the maximum number of supported priority + * levels, and we return the full priority level only + * if the BPR is programmed to its minimum, otherwise + * we return a combination of the priority level and + * subpriority, as determined by the setting of the + * BPR, but without the full subpriority. 
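+ *
+ * For example, with 5 bits of preemption (__vgic_v3_bpr_min() == 3), a
+ * single APR register holding 0x10 yields __ffs() == 4, i.e. an active
+ * priority of (0 + 4) << 3 == 0x20.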
+ */ + val = __vgic_v3_read_ap0rn(i); + val |= __vgic_v3_read_ap1rn(i); + if (!val) { + hap += 32; + continue; + } + + return (hap + __ffs(val)) << __vgic_v3_bpr_min(); + } + + return GICv3_IDLE_PRIORITY; +} + +static unsigned int __vgic_v3_get_bpr0(u32 vmcr) +{ + return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; +} + +static unsigned int __vgic_v3_get_bpr1(u32 vmcr) +{ + unsigned int bpr; + + if (vmcr & ICH_VMCR_CBPR_MASK) { + bpr = __vgic_v3_get_bpr0(vmcr); + if (bpr < 7) + bpr++; + } else { + bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + } + + return bpr; +} + +/* + * Convert a priority to a preemption level, taking the relevant BPR + * into account by zeroing the sub-priority bits. + */ +static u8 __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp) +{ + unsigned int bpr; + + if (!grp) + bpr = __vgic_v3_get_bpr0(vmcr) + 1; + else + bpr = __vgic_v3_get_bpr1(vmcr); + + return pri & (GENMASK(7, 0) << bpr); +} + +/* + * The priority value is independent of any of the BPR values, so we + * normalize it using the minimal BPR value. This guarantees that no + * matter what the guest does with its BPR, we can always set/get the + * same value of a priority. + */ +static void __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp) +{ + u8 pre, ap; + u32 val; + int apr; + + pre = __vgic_v3_pri_to_pre(pri, vmcr, grp); + ap = pre >> __vgic_v3_bpr_min(); + apr = ap / 32; + + if (!grp) { + val = __vgic_v3_read_ap0rn(apr); + __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr); + } else { + val = __vgic_v3_read_ap1rn(apr); + __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr); + } +} + +static int __vgic_v3_clear_highest_active_priority(void) +{ + u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); + u32 hap = 0; + int i; + + for (i = 0; i < nr_apr_regs; i++) { + u32 ap0, ap1; + int c0, c1; + + ap0 = __vgic_v3_read_ap0rn(i); + ap1 = __vgic_v3_read_ap1rn(i); + if (!ap0 && !ap1) { + hap += 32; + continue; + } + + c0 = ap0 ? __ffs(ap0) : 32; + c1 = ap1 ? 
__ffs(ap1) : 32; + + /* Always clear the LSB, which is the highest priority */ + if (c0 < c1) { + ap0 &= ~BIT(c0); + __vgic_v3_write_ap0rn(ap0, i); + hap += c0; + } else { + ap1 &= ~BIT(c1); + __vgic_v3_write_ap1rn(ap1, i); + hap += c1; + } + + /* Rescale to 8 bits of priority */ + return hap << __vgic_v3_bpr_min(); + } + + return GICv3_IDLE_PRIORITY; +} + +static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 lr_val; + u8 lr_prio, pmr; + int lr, grp; + + grp = __vgic_v3_get_group(vcpu); + + lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); + if (lr < 0) + goto spurious; + + if (grp != !!(lr_val & ICH_LR_GROUP)) + goto spurious; + + pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; + if (pmr <= lr_prio) + goto spurious; + + if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp)) + goto spurious; + + lr_val &= ~ICH_LR_STATE; + lr_val |= ICH_LR_ACTIVE_BIT; + __gic_v3_set_lr(lr_val, lr); + __vgic_v3_set_active_priority(lr_prio, vmcr, grp); + vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); + return; + +spurious: + vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS); +} + +static void __vgic_v3_clear_active_lr(int lr, u64 lr_val) +{ + lr_val &= ~ICH_LR_ACTIVE_BIT; + if (lr_val & ICH_LR_HW) { + u32 pid; + + pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT; + gic_write_dir(pid); + } + + __gic_v3_set_lr(lr_val, lr); +} + +static void __vgic_v3_bump_eoicount(void) +{ + u32 hcr; + + hcr = read_gicreg(ICH_HCR_EL2); + hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT; + write_gicreg(hcr, ICH_HCR_EL2); +} + +static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u32 vid = vcpu_get_reg(vcpu, rt); + u64 lr_val; + int lr; + + /* EOImode == 0, nothing to be done here */ + if (!(vmcr & ICH_VMCR_EOIM_MASK)) + return 1; + + /* No deactivate to be performed on an LPI */ + if (vid >= VGIC_MIN_LPI) + return 1; + + lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); + if (lr != -1) { + __vgic_v3_clear_active_lr(lr, lr_val); + return 1; + } + + return 0; +} + +static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + if (!___vgic_v3_write_dir(vcpu, vmcr, rt)) + __vgic_v3_bump_eoicount(); +} + +static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u32 vid = vcpu_get_reg(vcpu, rt); + u64 lr_val; + u8 lr_prio, act_prio; + int lr, grp; + + grp = __vgic_v3_get_group(vcpu); + + /* Drop priority in any case */ + act_prio = __vgic_v3_clear_highest_active_priority(); + + lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); + if (lr == -1) { + /* Do not bump EOIcount for LPIs that aren't in the LRs */ + if (!(vid >= VGIC_MIN_LPI)) + __vgic_v3_bump_eoicount(); + return; + } + + /* EOImode == 1 and not an LPI, nothing to be done here */ + if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI)) + return; + + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; + + /* If priorities or group do not match, the guest has fscked-up. 
*/ + if (grp != !!(lr_val & ICH_LR_GROUP) || + __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio) + return; + + /* Let's now perform the deactivation */ + __vgic_v3_clear_active_lr(lr, lr_val); +} + +static void __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK)); +} + +static void __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK)); +} + +static void __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + + if (val & 1) + vmcr |= ICH_VMCR_ENG0_MASK; + else + vmcr &= ~ICH_VMCR_ENG0_MASK; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + + if (val & 1) + vmcr |= ICH_VMCR_ENG1_MASK; + else + vmcr &= ~ICH_VMCR_ENG1_MASK; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr)); +} + +static void __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr)); +} + +static void __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + u8 bpr_min = __vgic_v3_bpr_min() - 1; + + /* Enforce BPR limiting */ + if (val < bpr_min) + val = bpr_min; + + val <<= ICH_VMCR_BPR0_SHIFT; + val &= ICH_VMCR_BPR0_MASK; + vmcr &= ~ICH_VMCR_BPR0_MASK; + vmcr |= val; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + u8 bpr_min = __vgic_v3_bpr_min(); + + if (vmcr & ICH_VMCR_CBPR_MASK) + return; + + /* Enforce BPR limiting */ + if (val < bpr_min) + val = bpr_min; + + val <<= ICH_VMCR_BPR1_SHIFT; + val &= ICH_VMCR_BPR1_MASK; + vmcr &= ~ICH_VMCR_BPR1_MASK; + vmcr |= val; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n) +{ + u32 val; + + if (!__vgic_v3_get_group(vcpu)) + val = __vgic_v3_read_ap0rn(n); + else + val = __vgic_v3_read_ap1rn(n); + + vcpu_set_reg(vcpu, rt, val); +} + +static void __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n) +{ + u32 val = vcpu_get_reg(vcpu, rt); + + if (!__vgic_v3_get_group(vcpu)) + __vgic_v3_write_ap0rn(val, n); + else + __vgic_v3_write_ap1rn(val, n); +} + +static void __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 0); +} + +static void __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 1); +} + +static void __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 2); +} + +static void __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 3); +} + +static void __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 0); +} + +static void __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 1); +} + +static void __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 2); +} + +static void __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 3); +} + +static void __vgic_v3_read_hppir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 lr_val; + int lr, lr_grp, grp; 
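+
+	/*
+	 * HPPIRn returns the highest priority pending INTID for the accessed
+	 * group without acknowledging it (unlike IAR, which also activates it).
+	 */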
+ + grp = __vgic_v3_get_group(vcpu); + + lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); + if (lr == -1) + goto spurious; + + lr_grp = !!(lr_val & ICH_LR_GROUP); + if (lr_grp != grp) + lr_val = ICC_IAR1_EL1_SPURIOUS; + +spurious: + vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); +} + +static void __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vmcr &= ICH_VMCR_PMR_MASK; + vmcr >>= ICH_VMCR_PMR_SHIFT; + vcpu_set_reg(vcpu, rt, vmcr); +} + +static void __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u32 val = vcpu_get_reg(vcpu, rt); + + val <<= ICH_VMCR_PMR_SHIFT; + val &= ICH_VMCR_PMR_MASK; + vmcr &= ~ICH_VMCR_PMR_MASK; + vmcr |= val; + + write_gicreg(vmcr, ICH_VMCR_EL2); +} + +static void __vgic_v3_read_rpr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u32 val = __vgic_v3_get_highest_active_priority(); + vcpu_set_reg(vcpu, rt, val); +} + +static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u32 vtr, val; + + vtr = read_gicreg(ICH_VTR_EL2); + /* PRIbits */ + val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; + /* IDbits */ + val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; + /* A3V */ + val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; + /* EOImode */ + val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT; + /* CBPR */ + val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; + + vcpu_set_reg(vcpu, rt, val); +} + +static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u32 val = vcpu_get_reg(vcpu, rt); + + if (val & ICC_CTLR_EL1_CBPR_MASK) + vmcr |= ICH_VMCR_CBPR_MASK; + else + vmcr &= ~ICH_VMCR_CBPR_MASK; + + if (val & ICC_CTLR_EL1_EOImode_MASK) + vmcr |= ICH_VMCR_EOIM_MASK; + else + vmcr &= ~ICH_VMCR_EOIM_MASK; + + write_gicreg(vmcr, ICH_VMCR_EL2); +} + +static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, + u32 sysreg, bool is_read) +{ + u64 ich_hcr; + + if (!is_nested_ctxt(vcpu)) + return false; + + ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + switch (sysreg) { + case SYS_ICC_IGRPEN0_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP0Rn_EL1(0): + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_BPR0_EL1: + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_IAR0_EL1: + return ich_hcr & ICH_HCR_EL2_TALL0; + + case SYS_ICC_IGRPEN1_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP1Rn_EL1(0): + case SYS_ICC_AP1Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(3): + case SYS_ICC_BPR1_EL1: + case SYS_ICC_EOIR1_EL1: + case SYS_ICC_HPPIR1_EL1: + case SYS_ICC_IAR1_EL1: + return ich_hcr & ICH_HCR_EL2_TALL1; + + case SYS_ICC_DIR_EL1: + if (ich_hcr & ICH_HCR_EL2_TDIR) + return true; + + fallthrough; + + case SYS_ICC_RPR_EL1: + case SYS_ICC_CTLR_EL1: + case SYS_ICC_PMR_EL1: + return ich_hcr & ICH_HCR_EL2_TC; + + default: + return false; + } +} + +int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) +{ + int rt; + u64 esr; + u32 vmcr; + void (*fn)(struct kvm_vcpu *, u32, int); + bool is_read; + u32 sysreg; + + if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != 
KVM_DEV_TYPE_ARM_VGIC_V3) + return 0; + + esr = kvm_vcpu_get_esr(vcpu); + if (vcpu_mode_is_32bit(vcpu)) { + if (!kvm_condition_valid(vcpu)) { + __kvm_skip_instr(vcpu); + return 1; + } + + sysreg = esr_cp15_to_sysreg(esr); + } else { + sysreg = esr_sys64_to_sysreg(esr); + } + + is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + + if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read)) + return 0; + + switch (sysreg) { + case SYS_ICC_IAR0_EL1: + case SYS_ICC_IAR1_EL1: + if (unlikely(!is_read)) + return 0; + fn = __vgic_v3_read_iar; + break; + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_EOIR1_EL1: + if (unlikely(is_read)) + return 0; + fn = __vgic_v3_write_eoir; + break; + case SYS_ICC_IGRPEN1_EL1: + if (is_read) + fn = __vgic_v3_read_igrpen1; + else + fn = __vgic_v3_write_igrpen1; + break; + case SYS_ICC_BPR1_EL1: + if (is_read) + fn = __vgic_v3_read_bpr1; + else + fn = __vgic_v3_write_bpr1; + break; + case SYS_ICC_AP0Rn_EL1(0): + case SYS_ICC_AP1Rn_EL1(0): + if (is_read) + fn = __vgic_v3_read_apxr0; + else + fn = __vgic_v3_write_apxr0; + break; + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(1): + if (is_read) + fn = __vgic_v3_read_apxr1; + else + fn = __vgic_v3_write_apxr1; + break; + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(2): + if (is_read) + fn = __vgic_v3_read_apxr2; + else + fn = __vgic_v3_write_apxr2; + break; + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_AP1Rn_EL1(3): + if (is_read) + fn = __vgic_v3_read_apxr3; + else + fn = __vgic_v3_write_apxr3; + break; + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_HPPIR1_EL1: + if (unlikely(!is_read)) + return 0; + fn = __vgic_v3_read_hppir; + break; + case SYS_ICC_IGRPEN0_EL1: + if (is_read) + fn = __vgic_v3_read_igrpen0; + else + fn = __vgic_v3_write_igrpen0; + break; + case SYS_ICC_BPR0_EL1: + if (is_read) + fn = __vgic_v3_read_bpr0; + else + fn = __vgic_v3_write_bpr0; + break; + case SYS_ICC_DIR_EL1: + if (unlikely(is_read)) + return 0; + /* + * Full exit if required to handle overflow deactivation, + * unless we can emulate it in the LRs (likely the majority + * of the cases). 
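+ * A deactivation that doesn't match any in-use LR can't be completed
+ * here and is left to the kernel proper.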
+ */ + if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) { + int ret; + + ret = ___vgic_v3_write_dir(vcpu, __vgic_v3_read_vmcr(), + kvm_vcpu_sys_get_rt(vcpu)); + if (ret) + __kvm_skip_instr(vcpu); + + return ret; + } + fn = __vgic_v3_write_dir; + break; + case SYS_ICC_RPR_EL1: + if (unlikely(!is_read)) + return 0; + fn = __vgic_v3_read_rpr; + break; + case SYS_ICC_CTLR_EL1: + if (is_read) + fn = __vgic_v3_read_ctlr; + else + fn = __vgic_v3_write_ctlr; + break; + case SYS_ICC_PMR_EL1: + if (is_read) + fn = __vgic_v3_read_pmr; + else + fn = __vgic_v3_write_pmr; + break; + default: + return 0; + } + + vmcr = __vgic_v3_read_vmcr(); + rt = kvm_vcpu_sys_get_rt(vcpu); + fn(vcpu, vmcr, rt); + + __kvm_skip_instr(vcpu); + + return 1; +} diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile new file mode 100644 index 000000000000..afc4aed9231a --- /dev/null +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Kernel-based Virtual Machine module, HYP/VHE part +# + +asflags-y := -D__KVM_VHE_HYPERVISOR__ +ccflags-y := -D__KVM_VHE_HYPERVISOR__ + +CFLAGS_switch.o += -Wno-override-init + +obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o +obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ + ../fpsimd.o ../hyp-entry.o ../exception.o diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c new file mode 100644 index 000000000000..0100339b09e0 --- /dev/null +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <hyp/debug-sr.h> + +#include <linux/kvm_host.h> + +#include <asm/kvm_hyp.h> + +void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +{ + __debug_switch_to_guest_common(vcpu); +} + +void __debug_switch_to_host(struct kvm_vcpu *vcpu) +{ + __debug_switch_to_host_common(vcpu); +} diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c new file mode 100644 index 000000000000..9984c492305a --- /dev/null +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -0,0 +1,688 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <hyp/switch.h> + +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/percpu.h> +#include <uapi/linux/psci.h> + +#include <kvm/arm_psci.h> + +#include <asm/barrier.h> +#include <asm/cpufeature.h> +#include <asm/kprobes.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/fpsimd.h> +#include <asm/debug-monitors.h> +#include <asm/processor.h> +#include <asm/thread_info.h> +#include <asm/vectors.h> + +/* VHE specific context */ +DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); +DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); +DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); + +/* + * HCR_EL2 bits that the NV guest can freely change (no RES0/RES1 + * semantics, irrespective of the configuration), but that cannot be + * applied to the actual HW as things would otherwise break badly. 
+ * + * - TGE: we want the guest to use EL1, which is incompatible with + * this bit being set + * + * - API/APK: they are already accounted for by vcpu_load(), and can + * only take effect across a load/put cycle (such as ERET) + * + * - FIEN: no way we let a guest have access to the RAS "Common Fault + * Injection" thing, whatever that does + */ +#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK | HCR_FIEN) + +static u64 __compute_hcr(struct kvm_vcpu *vcpu) +{ + u64 guest_hcr, hcr = vcpu->arch.hcr_el2; + + if (!vcpu_has_nv(vcpu)) + return hcr; + + /* + * We rely on the invariant that a vcpu entered from HYP + * context must also exit in the same context, as only an ERET + * instruction can kick us out of it, and we obviously trap + * that sucker. PSTATE.M will get fixed-up on exit. + */ + if (is_hyp_ctxt(vcpu)) { + host_data_set_flag(VCPU_IN_HYP_CONTEXT); + + hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; + + if (!vcpu_el2_e2h_is_set(vcpu)) + hcr |= HCR_NV1; + + /* + * Nothing in HCR_EL2 should impact running in hypervisor + * context, apart from bits we have defined as RESx (E2H, + * HCD and co), or that cannot be set directly (the EXCLUDE + * bits). Given that we OR the guest's view with the host's, + * we can use the 0 value as the starting point, and only + * use the config-driven RES1 bits. + */ + guest_hcr = kvm_vcpu_apply_reg_masks(vcpu, HCR_EL2, 0); + + write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); + } else { + host_data_clear_flag(VCPU_IN_HYP_CONTEXT); + + guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + if (guest_hcr & HCR_NV) { + u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id())); + + /* Inherit the low bits from the actual register */ + va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0); + write_sysreg_s(va, SYS_VNCR_EL2); + + /* Force NV2 in case the guest is forgetful... */ + guest_hcr |= HCR_NV2; + } + + /* + * Exclude the guest's TWED configuration if it hasn't set TWE + * to avoid potentially delaying traps for the host. + */ + if (!(guest_hcr & HCR_TWE)) + guest_hcr &= ~(HCR_EL2_TWEDEn | HCR_EL2_TWEDEL); + } + + BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) && + host_data_test_flag(L1_VNCR_MAPPED)); + + return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); +} + +static void __activate_traps(struct kvm_vcpu *vcpu) +{ + u64 val; + + ___activate_traps(vcpu, __compute_hcr(vcpu)); + + if (has_cntpoff()) { + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * We're entrering the guest. Reload the correct + * values from memory now that TGE is clear. + */ + if (map.direct_ptimer == vcpu_ptimer(vcpu)) + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + if (map.direct_ptimer == vcpu_hptimer(vcpu)) + val = __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); + + if (map.direct_ptimer) { + write_sysreg_el0(val, SYS_CNTP_CVAL); + isb(); + } + } + + __activate_cptr_traps(vcpu); + + write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); +} +NOKPROBE_SYMBOL(__activate_traps); + +static void __deactivate_traps(struct kvm_vcpu *vcpu) +{ + const char *host_vectors = vectors; + + ___deactivate_traps(vcpu); + + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); + + if (has_cntpoff()) { + struct timer_map map; + u64 val, offset; + + get_timer_map(vcpu, &map); + + /* + * We're exiting the guest. Save the latest CVAL value + * to memory and apply the offset now that TGE is set. 
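+ * With TGE set, CNTPOFF_EL2 no longer applies, so the comparator is
+ * rebased (CVAL + offset) to keep firing at the same physical time.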
+ */ + val = read_sysreg_el0(SYS_CNTP_CVAL); + if (map.direct_ptimer == vcpu_ptimer(vcpu)) + __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, val); + if (map.direct_ptimer == vcpu_hptimer(vcpu)) + __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, val); + + offset = read_sysreg_s(SYS_CNTPOFF_EL2); + + if (map.direct_ptimer && offset) { + write_sysreg_el0(val + offset, SYS_CNTP_CVAL); + isb(); + } + } + + /* + * ARM errata 1165522 and 1530923 require the actual execution of the + * above before we can switch to the EL2/EL0 translation regime used by + * the host. + */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + + __deactivate_cptr_traps(vcpu); + + if (!arm64_kernel_unmapped_at_el0()) + host_vectors = __this_cpu_read(this_cpu_vector); + write_sysreg(host_vectors, vbar_el1); +} +NOKPROBE_SYMBOL(__deactivate_traps); + +/* + * Disable IRQs in __vcpu_{load,put}_{activate,deactivate}_traps() to + * prevent a race condition between context switching of PMUSERENR_EL0 + * in __{activate,deactivate}_traps_common() and IPIs that attempts to + * update PMUSERENR_EL0. See also kvm_set_pmuserenr(). + */ +static void __vcpu_load_activate_traps(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + local_irq_save(flags); + __activate_traps_common(vcpu); + local_irq_restore(flags); +} + +static void __vcpu_put_deactivate_traps(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + local_irq_save(flags); + __deactivate_traps_common(vcpu); + local_irq_restore(flags); +} + +void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu) +{ + host_data_ptr(host_ctxt)->__hyp_running_vcpu = vcpu; + + __vcpu_load_switch_sysregs(vcpu); + __vcpu_load_activate_traps(vcpu); + __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); +} + +void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) +{ + __vcpu_put_deactivate_traps(vcpu); + __vcpu_put_switch_sysregs(vcpu); + + host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL; +} + +static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) +{ + unsigned long ctl; + u64 cval, cnt; + bool stat; + + switch (reg) { + case CNTP_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + cnt = compute_counter_value(vcpu_ptimer(vcpu)); + break; + case CNTV_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); + cnt = compute_counter_value(vcpu_vtimer(vcpu)); + break; + default: + BUG(); + } + + stat = cval <= cnt; + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat); + + return ctl; +} + +static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr, val; + + /* + * Having FEAT_ECV allows for a better quality of timer emulation. + * However, this comes at a huge cost in terms of traps. Try and + * satisfy the reads from guest's hypervisor context without + * returning to the kernel if we can. 
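+ * Only reads from a vEL2 (hyp) context are handled here; writes and
+ * everything else still take the slow path.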
+ */ + if (!is_hyp_ctxt(vcpu)) + return false; + + esr = kvm_vcpu_get_esr(vcpu); + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ) + return false; + + switch (esr_sys64_to_sysreg(esr)) { + case SYS_CNTP_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTP_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + break; + case SYS_CNTP_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el0(SYS_CNTP_CVAL); + + if (!has_cntpoff()) + val -= timer_get_offset(vcpu_hptimer(vcpu)); + } else { + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + } + break; + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + val = compute_counter_value(vcpu_hptimer(vcpu)); + break; + case SYS_CNTV_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTV_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CVAL); + else + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + val = compute_counter_value(vcpu_hvtimer(vcpu)); + break; + default: + return false; + } + + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 spsr, elr, mode; + + /* + * Going through the whole put/load motions is a waste of time + * if this is a VHE guest hypervisor returning to its own + * userspace, or the hypervisor performing a local exception + * return. No need to save/restore registers, no need to + * switch S2 MMU. Just do the canonical ERET. + * + * Unless the trap has to be forwarded further down the line, + * of course... + */ + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) || + (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_ERET)) + return false; + + spsr = read_sysreg_el1(SYS_SPSR); + mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL0t: + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + return false; + break; + case PSR_MODE_EL2t: + mode = PSR_MODE_EL1t; + break; + case PSR_MODE_EL2h: + mode = PSR_MODE_EL1h; + break; + default: + return false; + } + + /* If ERETAx fails, take the slow path */ + if (esr_iss_is_eretax(esr)) { + if (!(vcpu_has_ptrauth(vcpu) && kvm_auth_eretax(vcpu, &elr))) + return false; + } else { + elr = read_sysreg_el1(SYS_ELR); + } + + spsr = (spsr & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; + + write_sysreg_el2(spsr, SYS_SPSR); + write_sysreg_el2(elr, SYS_ELR); + + return true; +} + +static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + int ret = -EINVAL; + u32 instr; + u64 val; + + /* + * Ideally, we would never trap on EL2 S1 TLB invalidations using + * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}. + * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2, + * meaning that we can't track changes to the virtual TGE bit. So we + * have to leave HCR_EL2.TTLB set on the host. Oopsie... 
+ * + * Try and handle these invalidation as quickly as possible, without + * fully exiting. Note that we don't need to consider any forwarding + * here, as having E2H+TGE set is the very definition of being + * InHost. + * + * For the lesser hypervisors out there that have failed to get on + * with the VHE program, we can also handle the nVHE style of EL2 + * invalidation. + */ + if (!(is_hyp_ctxt(vcpu))) + return false; + + instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + + if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) && + vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) || + kvm_supported_tlbi_s1e2_op (vcpu, instr)) + ret = __kvm_tlbi_s1e2(NULL, val, instr); + + if (ret) + return false; + + /* + * If we have to check for any VNCR mapping being invalidated, + * go back to the slow path for further processing. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) && + atomic_read(&vcpu->kvm->arch.vncr_map_count)) + return false; + + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + int rt; + + if (!is_hyp_ctxt(vcpu) || esr_sys64_to_sysreg(esr) != SYS_CPACR_EL1) + return false; + + rt = kvm_vcpu_sys_get_rt(vcpu); + + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ) { + vcpu_set_reg(vcpu, rt, __vcpu_sys_reg(vcpu, CPTR_EL2)); + } else { + vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, rt), CPTR_EL2); + __activate_cptr_traps(vcpu); + } + + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + + if (!vcpu_has_nv(vcpu)) + return false; + + if (sysreg != SYS_ZCR_EL2) + return false; + + if (guest_owns_fp_regs()) + return false; + + /* + * ZCR_EL2 traps are handled in the slow path, with the expectation + * that the guest's FP context has already been loaded onto the CPU. + * + * Load the guest's FP context and unconditionally forward to the + * slow path for handling (i.e. return false). + */ + kvm_hyp_handle_fpsimd(vcpu, exit_code); + return false; +} + +static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_timer(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_zcr_el2(vcpu, exit_code)) + return true; + + return kvm_hyp_handle_sysreg(vcpu, exit_code); +} + +static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 iss; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return false; + + /* + * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2 + * is populated with a correct ISS for a sysreg trap. These fruity + * parts are 64bit only, so unconditionally set IL. + */ + iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2)); + vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) | + FIELD_PREP(ESR_ELx_ISS_MASK, iss) | + ESR_ELx_IL; + return false; +} + +static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... 
ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg_vhe, + [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, + [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret, + [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, + + /* Apple shenanigans */ + [0x3F] = kvm_hyp_handle_impdef, +}; + +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + synchronize_vcpu_pstate(vcpu, exit_code); + + /* + * If we were in HYP context on entry, adjust the PSTATE view + * so that the usual helpers work correctly. This enforces our + * invariant that the guest's HYP context status is preserved + * across a run. + */ + if (vcpu_has_nv(vcpu) && + unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) { + u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL1t: + mode = PSR_MODE_EL2t; + break; + case PSR_MODE_EL1h: + mode = PSR_MODE_EL2h; + break; + } + + *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); + *vcpu_cpsr(vcpu) |= mode; + } + + /* Apply extreme paranoia! */ + BUG_ON(vcpu_has_nv(vcpu) && + !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu)); + + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); +} + +/* Switch to the guest for VHE systems running in EL2 */ +static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *host_ctxt; + struct kvm_cpu_context *guest_ctxt; + u64 exit_code; + + host_ctxt = host_data_ptr(host_ctxt); + guest_ctxt = &vcpu->arch.ctxt; + + fpsimd_lazy_switch_to_guest(vcpu); + + sysreg_save_host_state_vhe(host_ctxt); + + /* + * Note that ARM erratum 1165522 requires us to configure both stage 1 + * and stage 2 translation for the guest context before we clear + * HCR_EL2.TGE. The stage 1 and stage 2 guest context has already been + * loaded on the CPU in kvm_vcpu_load_vhe(). + */ + __activate_traps(vcpu); + + __kvm_adjust_pc(vcpu); + + sysreg_restore_guest_state_vhe(guest_ctxt); + __debug_switch_to_guest(vcpu); + + do { + /* Jump in the fire! */ + exit_code = __guest_enter(vcpu); + + /* And we're baaack! */ + } while (fixup_guest_exit(vcpu, &exit_code)); + + sysreg_save_guest_state_vhe(guest_ctxt); + + __deactivate_traps(vcpu); + + sysreg_restore_host_state_vhe(host_ctxt); + + __debug_switch_to_host(vcpu); + + /* + * Ensure that all system register writes above have taken effect + * before returning to the host. In VHE mode, CPTR traps for + * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be + * manipulated after the ISB. + */ + isb(); + + fpsimd_lazy_switch_to_host(vcpu); + + if (guest_owns_fp_regs()) + __fpsimd_save_fpexc32(vcpu); + + return exit_code; +} +NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); + +int __kvm_vcpu_run(struct kvm_vcpu *vcpu) +{ + int ret; + + local_daif_mask(); + + /* + * Having IRQs masked via PMR when entering the guest means the GIC + * will not signal the CPU of interrupts of lower priority, and the + * only way to get out will be via guest exceptions. + * Naturally, we want to avoid this. + * + * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a + * dsb to ensure the redistributor is forwards EL2 IRQs to the CPU. 
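+ * pmr_sync() below provides that barrier; it only expands to a dsb
+ * when GIC priority masking is in use.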
+ */ + pmr_sync(); + + ret = __kvm_vcpu_run_vhe(vcpu); + + /* + * local_daif_restore() takes care to properly restore PSTATE.DAIF + * and the GIC PMR if the host is using IRQ priorities. + */ + local_daif_restore(DAIF_PROCCTX_NOIRQ); + + return ret; +} + +static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) +{ + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = host_data_ptr(host_ctxt); + vcpu = host_ctxt->__hyp_running_vcpu; + + __deactivate_traps(vcpu); + sysreg_restore_host_state_vhe(host_ctxt); + + panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", + spsr, elr, + read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR), + read_sysreg(hpfar_el2), par, vcpu); +} +NOKPROBE_SYMBOL(__hyp_call_panic); + +void __noreturn hyp_panic(void) +{ + u64 spsr = read_sysreg_el2(SYS_SPSR); + u64 elr = read_sysreg_el2(SYS_ELR); + u64 par = read_sysreg_par(); + + __hyp_call_panic(spsr, elr, par); +} + +asmlinkage void kvm_unexpected_el2_exception(void) +{ + __kvm_unexpected_el2_exception(); +} diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c new file mode 100644 index 000000000000..f28c6cf4fe1b --- /dev/null +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <hyp/sysreg-sr.h> + +#include <linux/compiler.h> +#include <linux/kvm_host.h> + +#include <asm/kprobes.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_nested.h> + +static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) +{ + /* These registers are common with EL1 */ + __vcpu_assign_sys_reg(vcpu, PAR_EL1, read_sysreg(par_el1)); + __vcpu_assign_sys_reg(vcpu, TPIDR_EL1, read_sysreg(tpidr_el1)); + + __vcpu_assign_sys_reg(vcpu, ESR_EL2, read_sysreg_el1(SYS_ESR)); + __vcpu_assign_sys_reg(vcpu, AFSR0_EL2, read_sysreg_el1(SYS_AFSR0)); + __vcpu_assign_sys_reg(vcpu, AFSR1_EL2, read_sysreg_el1(SYS_AFSR1)); + __vcpu_assign_sys_reg(vcpu, FAR_EL2, read_sysreg_el1(SYS_FAR)); + __vcpu_assign_sys_reg(vcpu, MAIR_EL2, read_sysreg_el1(SYS_MAIR)); + __vcpu_assign_sys_reg(vcpu, VBAR_EL2, read_sysreg_el1(SYS_VBAR)); + __vcpu_assign_sys_reg(vcpu, CONTEXTIDR_EL2, read_sysreg_el1(SYS_CONTEXTIDR)); + __vcpu_assign_sys_reg(vcpu, AMAIR_EL2, read_sysreg_el1(SYS_AMAIR)); + + /* + * In VHE mode those registers are compatible between EL1 and EL2, + * and the guest uses the _EL1 versions on the CPU naturally. + * So we save them into their _EL2 versions here. + * For nVHE mode we trap accesses to those registers, so our + * _EL2 copy in sys_regs[] is always up-to-date and we don't need + * to save anything here. + */ + if (vcpu_el2_e2h_is_set(vcpu)) { + u64 val; + + /* + * We don't save CPTR_EL2, as accesses to CPACR_EL1 + * are always trapped, ensuring that the in-memory + * copy is always up-to-date. A small blessing... 
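+ * (See kvm_hyp_handle_cpacr_el1() for the fast-path handler that keeps
+ * it in sync.)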
+ */ + __vcpu_assign_sys_reg(vcpu, SCTLR_EL2, read_sysreg_el1(SYS_SCTLR)); + __vcpu_assign_sys_reg(vcpu, TTBR0_EL2, read_sysreg_el1(SYS_TTBR0)); + __vcpu_assign_sys_reg(vcpu, TTBR1_EL2, read_sysreg_el1(SYS_TTBR1)); + __vcpu_assign_sys_reg(vcpu, TCR_EL2, read_sysreg_el1(SYS_TCR)); + + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { + __vcpu_assign_sys_reg(vcpu, TCR2_EL2, read_sysreg_el1(SYS_TCR2)); + + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + __vcpu_assign_sys_reg(vcpu, PIRE0_EL2, read_sysreg_el1(SYS_PIRE0)); + __vcpu_assign_sys_reg(vcpu, PIR_EL2, read_sysreg_el1(SYS_PIR)); + } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + __vcpu_assign_sys_reg(vcpu, POR_EL2, read_sysreg_el1(SYS_POR)); + } + + /* + * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where + * the interesting CNTHCTL_EL2 bits live. So preserve these + * bits when reading back the guest-visible value. + */ + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, &=, ~CNTKCTL_VALID_BITS); + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, |=, val); + } + + __vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1)); + __vcpu_assign_sys_reg(vcpu, ELR_EL2, read_sysreg_el1(SYS_ELR)); + __vcpu_assign_sys_reg(vcpu, SPSR_EL2, read_sysreg_el1(SYS_SPSR)); + + if (ctxt_has_sctlr2(&vcpu->arch.ctxt)) + __vcpu_assign_sys_reg(vcpu, SCTLR2_EL2, read_sysreg_el1(SYS_SCTLR2)); +} + +static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* These registers are common with EL1 */ + write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); + write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); + + write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2); + write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); + write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); + + if (vcpu_el2_e2h_is_set(vcpu)) { + /* + * In VHE mode those registers are compatible between + * EL1 and EL2. + */ + write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL); + } else { + /* + * CNTHCTL_EL2 only affects EL1 when running nVHE, so + * no need to restore it. 
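+ * The remaining _EL2 registers don't share the EL1 layout when E2H is
+ * clear, so they are translated into their EL1 equivalents below, e.g.
+ * CPTR_EL2 into CPACR_EL1 format via translate_cptr_el2_to_cpacr_el1().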
+ */ + val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2)); + write_sysreg_el1(val, SYS_SCTLR); + val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2)); + write_sysreg_el1(val, SYS_CPACR); + val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2)); + write_sysreg_el1(val, SYS_TTBR0); + val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2)); + write_sysreg_el1(val, SYS_TCR); + } + + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2); + + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0); + } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR); + } + + write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR); + write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); + + if (ctxt_has_sctlr2(&vcpu->arch.ctxt)) + write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR2_EL2), SYS_SCTLR2); +} + +/* + * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and + * pstate, which are handled as part of the el2 return state) on every + * switch (sp_el0 is being dealt with in the assembly code). + * tpidr_el0 and tpidrro_el0 only need to be switched when going + * to host userspace or a different VCPU. EL1 registers only need to be + * switched when potentially going to run a different VCPU. The latter two + * classes are handled as part of kvm_arch_vcpu_load and kvm_arch_vcpu_put. + */ + +void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt) +{ + __sysreg_save_common_state(ctxt); +} +NOKPROBE_SYMBOL(sysreg_save_host_state_vhe); + +void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt) +{ + __sysreg_save_common_state(ctxt); + __sysreg_save_el2_return_state(ctxt); +} +NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe); + +void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt) +{ + __sysreg_restore_common_state(ctxt); +} +NOKPROBE_SYMBOL(sysreg_restore_host_state_vhe); + +void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) +{ + __sysreg_restore_common_state(ctxt); + __sysreg_restore_el2_return_state(ctxt); +} +NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); + +/** + * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU + * + * @vcpu: The VCPU pointer + * + * Load system registers that do not affect the host's execution, for + * example EL1 system registers on a VHE system where the host kernel + * runs at EL2. This function is called from KVM's vcpu_load() function + * and loading system register state early avoids having to load them on + * every entry to the VM. + */ +void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; + struct kvm_cpu_context *host_ctxt; + u64 midr, mpidr; + + host_ctxt = host_data_ptr(host_ctxt); + __sysreg_save_user_state(host_ctxt); + + /* + * When running a normal EL1 guest, we only load a new vcpu + * after a context switch, which imvolves a DSB, so all + * speculative EL1&0 walks will have already completed. 
+ * If running NV, the vcpu may transition between vEL1 and + * vEL2 without a context switch, so make sure we complete + * those walks before loading a new context. + */ + if (vcpu_has_nv(vcpu)) + dsb(nsh); + + /* + * Load guest EL1 and user state + * + * We must restore the 32-bit state before the sysregs, thanks + * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72). + */ + __sysreg32_restore_state(vcpu); + __sysreg_restore_user_state(guest_ctxt); + + if (unlikely(is_hyp_ctxt(vcpu))) { + __sysreg_restore_vel2_state(vcpu); + } else { + if (vcpu_has_nv(vcpu)) { + /* + * As we're restoring a nested guest, set the value + * provided by the guest hypervisor. + */ + midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2); + mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); + } else { + midr = ctxt_midr_el1(guest_ctxt); + mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); + } + + __sysreg_restore_el1_state(guest_ctxt, midr, mpidr); + } + + vcpu_set_flag(vcpu, SYSREGS_ON_CPU); +} + +/** + * __vcpu_put_switch_sysregs - Restore host system registers to the physical CPU + * + * @vcpu: The VCPU pointer + * + * Save guest system registers that do not affect the host's execution, for + * example EL1 system registers on a VHE system where the host kernel + * runs at EL2. This function is called from KVM's vcpu_put() function + * and deferring saving system register state until we're no longer running the + * VCPU avoids having to save them on every exit from the VM. + */ +void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; + struct kvm_cpu_context *host_ctxt; + + host_ctxt = host_data_ptr(host_ctxt); + + if (unlikely(is_hyp_ctxt(vcpu))) + __sysreg_save_vel2_state(vcpu); + else + __sysreg_save_el1_state(guest_ctxt); + + __sysreg_save_user_state(guest_ctxt); + __sysreg32_save_state(vcpu); + + /* Restore host user state */ + __sysreg_restore_user_state(host_ctxt); + + vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); +} diff --git a/arch/arm64/kvm/hyp/vhe/timer-sr.c b/arch/arm64/kvm/hyp/vhe/timer-sr.c new file mode 100644 index 000000000000..4cda674a8be6 --- /dev/null +++ b/arch/arm64/kvm/hyp/vhe/timer-sr.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <asm/kvm_hyp.h> + +void __kvm_timer_set_cntvoff(u64 cntvoff) +{ + write_sysreg(cntvoff, cntvoff_el2); +} diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c new file mode 100644 index 000000000000..ec2569818629 --- /dev/null +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/irqflags.h> + +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/tlbflush.h> + +struct tlb_inv_context { + struct kvm_s2_mmu *mmu; + unsigned long flags; + u64 tcr; + u64 sctlr; +}; + +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + u64 val; + + local_irq_save(cxt->flags); + + if (vcpu && mmu != vcpu->arch.hw_mmu) + cxt->mmu = vcpu->arch.hw_mmu; + else + cxt->mmu = NULL; + + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { + /* + * For CPUs that are affected by ARM errata 1165522 or 1530923, + * we cannot trust stage-1 to be in a correct state at that + * point. 
Since we do not want to force a full load of the + * vcpu state, we prevent the EL1 page-table walker to + * allocate new TLBs. This is done by setting the EPD bits + * in the TCR_EL1 register. We also need to prevent it to + * allocate IPA->PA walks, so we enable the S1 MMU... + */ + val = cxt->tcr = read_sysreg_el1(SYS_TCR); + val |= TCR_EPD1_MASK | TCR_EPD0_MASK; + write_sysreg_el1(val, SYS_TCR); + val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR); + val |= SCTLR_ELx_M; + write_sysreg_el1(val, SYS_SCTLR); + } + + /* + * With VHE enabled, we have HCR_EL2.{E2H,TGE} = {1,1}, and + * most TLB operations target EL2/EL0. In order to affect the + * guest TLBs (EL1/EL0), we need to change one of these two + * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so + * let's flip TGE before executing the TLB operation. + * + * ARM erratum 1165522 requires some special handling (again), + * as we need to make sure both stages of translation are in + * place before clearing TGE. __load_stage2() already + * has an ISB in order to deal with this. + */ + __load_stage2(mmu, mmu->arch); + val = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + write_sysreg_hcr(val); + isb(); +} + +static void exit_vmid_context(struct tlb_inv_context *cxt) +{ + /* + * We're done with the TLB operation, let's restore the host's + * view of HCR_EL2. + */ + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); + isb(); + + /* ... and the stage-2 MMU context that we switched away from */ + if (cxt->mmu) + __load_stage2(cxt->mmu, cxt->mmu->arch); + + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { + /* Restore the registers to what they were */ + write_sysreg_el1(cxt->tcr, SYS_TCR); + write_sysreg_el1(cxt->sctlr, SYS_SCTLR); + } + + local_irq_restore(cxt->flags); +} + +void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + dsb(ishst); + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1is, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(ish); + __tlbi(vmalle1is); + dsb(ish); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + dsb(nshst); + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. 
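 *
 * To illustrate the hazard: had the Stage-1 invalidation been able to
 * complete first, another CPU could (in the window before the IPAS2E1
 * takes effect) cache a combined S1+S2 entry derived from the old
 * Stage-2 mapping, and IPAS2E1 is not required to remove combined
 * entries. The DSB between the two invalidations below closes that
 * window.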
+ */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, + phys_addr_t start, unsigned long pages) +{ + struct tlb_inv_context cxt; + unsigned long stride; + + /* + * Since the range of addresses may not be mapped at + * the same level, assume the worst case as PAGE_SIZE + */ + stride = PAGE_SIZE; + start = round_down(start, stride); + + dsb(ishst); + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt); + + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); + + dsb(ish); + __tlbi(vmalle1is); + dsb(ish); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) +{ + struct tlb_inv_context cxt; + + dsb(ishst); + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt); + + __tlbi(vmalls12e1is); + dsb(ish); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) +{ + struct tlb_inv_context cxt; + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt); + + __tlbi(vmalle1); + asm volatile("ic iallu"); + dsb(nsh); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_flush_vm_context(void) +{ + dsb(ishst); + __tlbi(alle1is); + dsb(ish); +} + +/* + * TLB invalidation emulation for NV. For any given instruction, we + * perform the following transformtions: + * + * - a TLBI targeting EL2 S1 is remapped to EL1 S1 + * - a non-shareable TLBI is upgraded to being inner-shareable + * - an outer-shareable TLBI is also mapped to inner-shareable + * - an nXS TLBI is upgraded to XS + */ +int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding) +{ + struct tlb_inv_context cxt; + int ret = 0; + + /* + * The guest will have provided its own DSB ISHST before trapping. + * If it hasn't, that's its own problem, and we won't paper over it + * (plus, there is plenty of extra synchronisation before we even + * get here...). 
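 *
 * To make the transformations concrete (they match the switch below):
 * a guest hypervisor's "TLBI VAE2, Xt" is emulated as "TLBI VAE1IS, Xt",
 * and "TLBI RVAE2OSNXS, Xt" collapses to "TLBI RVAE1IS, Xt".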
+ */ + + if (mmu) + enter_vmid_context(mmu, &cxt); + + switch (sys_encoding) { + case OP_TLBI_ALLE2: + case OP_TLBI_ALLE2IS: + case OP_TLBI_ALLE2OS: + case OP_TLBI_VMALLE1: + case OP_TLBI_VMALLE1IS: + case OP_TLBI_VMALLE1OS: + case OP_TLBI_ALLE2NXS: + case OP_TLBI_ALLE2ISNXS: + case OP_TLBI_ALLE2OSNXS: + case OP_TLBI_VMALLE1NXS: + case OP_TLBI_VMALLE1ISNXS: + case OP_TLBI_VMALLE1OSNXS: + __tlbi(vmalle1is); + break; + case OP_TLBI_VAE2: + case OP_TLBI_VAE2IS: + case OP_TLBI_VAE2OS: + case OP_TLBI_VAE1: + case OP_TLBI_VAE1IS: + case OP_TLBI_VAE1OS: + case OP_TLBI_VAE2NXS: + case OP_TLBI_VAE2ISNXS: + case OP_TLBI_VAE2OSNXS: + case OP_TLBI_VAE1NXS: + case OP_TLBI_VAE1ISNXS: + case OP_TLBI_VAE1OSNXS: + __tlbi(vae1is, va); + break; + case OP_TLBI_VALE2: + case OP_TLBI_VALE2IS: + case OP_TLBI_VALE2OS: + case OP_TLBI_VALE1: + case OP_TLBI_VALE1IS: + case OP_TLBI_VALE1OS: + case OP_TLBI_VALE2NXS: + case OP_TLBI_VALE2ISNXS: + case OP_TLBI_VALE2OSNXS: + case OP_TLBI_VALE1NXS: + case OP_TLBI_VALE1ISNXS: + case OP_TLBI_VALE1OSNXS: + __tlbi(vale1is, va); + break; + case OP_TLBI_ASIDE1: + case OP_TLBI_ASIDE1IS: + case OP_TLBI_ASIDE1OS: + case OP_TLBI_ASIDE1NXS: + case OP_TLBI_ASIDE1ISNXS: + case OP_TLBI_ASIDE1OSNXS: + __tlbi(aside1is, va); + break; + case OP_TLBI_VAAE1: + case OP_TLBI_VAAE1IS: + case OP_TLBI_VAAE1OS: + case OP_TLBI_VAAE1NXS: + case OP_TLBI_VAAE1ISNXS: + case OP_TLBI_VAAE1OSNXS: + __tlbi(vaae1is, va); + break; + case OP_TLBI_VAALE1: + case OP_TLBI_VAALE1IS: + case OP_TLBI_VAALE1OS: + case OP_TLBI_VAALE1NXS: + case OP_TLBI_VAALE1ISNXS: + case OP_TLBI_VAALE1OSNXS: + __tlbi(vaale1is, va); + break; + case OP_TLBI_RVAE2: + case OP_TLBI_RVAE2IS: + case OP_TLBI_RVAE2OS: + case OP_TLBI_RVAE1: + case OP_TLBI_RVAE1IS: + case OP_TLBI_RVAE1OS: + case OP_TLBI_RVAE2NXS: + case OP_TLBI_RVAE2ISNXS: + case OP_TLBI_RVAE2OSNXS: + case OP_TLBI_RVAE1NXS: + case OP_TLBI_RVAE1ISNXS: + case OP_TLBI_RVAE1OSNXS: + __tlbi(rvae1is, va); + break; + case OP_TLBI_RVALE2: + case OP_TLBI_RVALE2IS: + case OP_TLBI_RVALE2OS: + case OP_TLBI_RVALE1: + case OP_TLBI_RVALE1IS: + case OP_TLBI_RVALE1OS: + case OP_TLBI_RVALE2NXS: + case OP_TLBI_RVALE2ISNXS: + case OP_TLBI_RVALE2OSNXS: + case OP_TLBI_RVALE1NXS: + case OP_TLBI_RVALE1ISNXS: + case OP_TLBI_RVALE1OSNXS: + __tlbi(rvale1is, va); + break; + case OP_TLBI_RVAAE1: + case OP_TLBI_RVAAE1IS: + case OP_TLBI_RVAAE1OS: + case OP_TLBI_RVAAE1NXS: + case OP_TLBI_RVAAE1ISNXS: + case OP_TLBI_RVAAE1OSNXS: + __tlbi(rvaae1is, va); + break; + case OP_TLBI_RVAALE1: + case OP_TLBI_RVAALE1IS: + case OP_TLBI_RVAALE1OS: + case OP_TLBI_RVAALE1NXS: + case OP_TLBI_RVAALE1ISNXS: + case OP_TLBI_RVAALE1OSNXS: + __tlbi(rvaale1is, va); + break; + default: + ret = -EINVAL; + } + dsb(ish); + isb(); + + if (mmu) + exit_vmid_context(&cxt); + + return ret; +} diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c new file mode 100644 index 000000000000..58c5fe7d7572 --- /dev/null +++ b/arch/arm64/kvm/hypercalls.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Arm Ltd. 
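/*
 * For context, the guest's side of the PTP service implemented below looks
 * roughly like the sketch here: a0/a1 return the wall-clock time in ns and
 * a2/a3 the counter value, exactly as kvm_ptp_get_time() fills them in.
 * arm_smccc_1_1_invoke() and the KVM_PTP_* selectors are the regular SMCCC
 * definitions already used in this file; the wrapper function itself is
 * illustrative only.
 *
 *	static u64 guest_ptp_counter(void)
 *	{
 *		struct arm_smccc_res res;
 *
 *		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
 *				     KVM_PTP_VIRT_COUNTER, &res);
 *		return ((u64)res.a2 << 32) | res.a3;
 *	}
 */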
+ +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> + +#include <kvm/arm_hypercalls.h> +#include <kvm/arm_psci.h> + +#define KVM_ARM_SMCCC_STD_FEATURES \ + GENMASK(KVM_REG_ARM_STD_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_STD_HYP_FEATURES \ + GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES \ + GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2 \ + GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT - 1, 0) + +static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) +{ + struct system_time_snapshot systime_snapshot; + u64 cycles = ~0UL; + u32 feature; + + /* + * system time and counter value must captured at the same + * time to keep consistency and precision. + */ + ktime_get_snapshot(&systime_snapshot); + + /* + * This is only valid if the current clocksource is the + * architected counter, as this is the only one the guest + * can see. + */ + if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER) + return; + + /* + * The guest selects one of the two reference counters + * (virtual or physical) with the first argument of the SMCCC + * call. In case the identifier is not supported, error out. + */ + feature = smccc_get_arg1(vcpu); + switch (feature) { + case KVM_PTP_VIRT_COUNTER: + cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.voffset; + break; + case KVM_PTP_PHYS_COUNTER: + cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.poffset; + break; + default: + return; + } + + /* + * This relies on the top bit of val[0] never being set for + * valid values of system time, because that is *really* far + * in the future (about 292 years from 1970, and at that stage + * nobody will give a damn about it). + */ + val[0] = upper_32_bits(systime_snapshot.real); + val[1] = lower_32_bits(systime_snapshot.real); + val[2] = upper_32_bits(cycles); + val[3] = lower_32_bits(cycles); +} + +static bool kvm_smccc_default_allowed(u32 func_id) +{ + switch (func_id) { + /* + * List of function-ids that are not gated with the bitmapped + * feature firmware registers, and are to be allowed for + * servicing the call by default. 
+ */ + case ARM_SMCCC_VERSION_FUNC_ID: + case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: + return true; + default: + /* PSCI 0.2 and up is in the 0:0x1f range */ + if (ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && + ARM_SMCCC_FUNC_NUM(func_id) <= 0x1f) + return true; + + /* + * KVM's PSCI 0.1 doesn't comply with SMCCC, and has + * its own function-id base and range + */ + if (func_id >= KVM_PSCI_FN(0) && func_id <= KVM_PSCI_FN(3)) + return true; + + return false; + } +} + +static bool kvm_smccc_test_fw_bmap(struct kvm_vcpu *vcpu, u32 func_id) +{ + struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; + + switch (func_id) { + case ARM_SMCCC_TRNG_VERSION: + case ARM_SMCCC_TRNG_FEATURES: + case ARM_SMCCC_TRNG_GET_UUID: + case ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + return test_bit(KVM_REG_ARM_STD_BIT_TRNG_V1_0, + &smccc_feat->std_bmap); + case ARM_SMCCC_HV_PV_TIME_FEATURES: + case ARM_SMCCC_HV_PV_TIME_ST: + return test_bit(KVM_REG_ARM_STD_HYP_BIT_PV_TIME, + &smccc_feat->std_hyp_bmap); + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT, + &smccc_feat->vendor_hyp_bmap); + case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: + return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_PTP, + &smccc_feat->vendor_hyp_bmap); + default: + return false; + } +} + +#define SMC32_ARCH_RANGE_BEGIN ARM_SMCCC_VERSION_FUNC_ID +#define SMC32_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + 0, ARM_SMCCC_FUNC_MASK) + +#define SMC64_ARCH_RANGE_BEGIN ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + 0, 0) +#define SMC64_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + 0, ARM_SMCCC_FUNC_MASK) + +static int kvm_smccc_filter_insert_reserved(struct kvm *kvm) +{ + int r; + + /* + * Prevent userspace from handling any SMCCC calls in the architecture + * range, avoiding the risk of misrepresenting Spectre mitigation status + * to the guest. 
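 *
 * With the SMCCC encoding, the two reserved ranges inserted here work out
 * to 0x8000_0000..0x8000_ffff (SMC32) and 0xc000_0000..0xc000_ffff (SMC64),
 * i.e. every fast call owned by the Arm Architecture service.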
+ */ + r = mtree_insert_range(&kvm->arch.smccc_filter, + SMC32_ARCH_RANGE_BEGIN, SMC32_ARCH_RANGE_END, + xa_mk_value(KVM_SMCCC_FILTER_HANDLE), + GFP_KERNEL_ACCOUNT); + if (r) + goto out_destroy; + + r = mtree_insert_range(&kvm->arch.smccc_filter, + SMC64_ARCH_RANGE_BEGIN, SMC64_ARCH_RANGE_END, + xa_mk_value(KVM_SMCCC_FILTER_HANDLE), + GFP_KERNEL_ACCOUNT); + if (r) + goto out_destroy; + + return 0; +out_destroy: + mtree_destroy(&kvm->arch.smccc_filter); + return r; +} + +static bool kvm_smccc_filter_configured(struct kvm *kvm) +{ + return !mtree_empty(&kvm->arch.smccc_filter); +} + +static int kvm_smccc_set_filter(struct kvm *kvm, struct kvm_smccc_filter __user *uaddr) +{ + const void *zero_page = page_to_virt(ZERO_PAGE(0)); + struct kvm_smccc_filter filter; + u32 start, end; + int r; + + if (copy_from_user(&filter, uaddr, sizeof(filter))) + return -EFAULT; + + if (memcmp(filter.pad, zero_page, sizeof(filter.pad))) + return -EINVAL; + + start = filter.base; + end = start + filter.nr_functions - 1; + + if (end < start || filter.action >= NR_SMCCC_FILTER_ACTIONS) + return -EINVAL; + + mutex_lock(&kvm->arch.config_lock); + + if (kvm_vm_has_ran_once(kvm)) { + r = -EBUSY; + goto out_unlock; + } + + if (!kvm_smccc_filter_configured(kvm)) { + r = kvm_smccc_filter_insert_reserved(kvm); + if (WARN_ON_ONCE(r)) + goto out_unlock; + } + + r = mtree_insert_range(&kvm->arch.smccc_filter, start, end, + xa_mk_value(filter.action), GFP_KERNEL_ACCOUNT); +out_unlock: + mutex_unlock(&kvm->arch.config_lock); + return r; +} + +static u8 kvm_smccc_filter_get_action(struct kvm *kvm, u32 func_id) +{ + unsigned long idx = func_id; + void *val; + + if (!kvm_smccc_filter_configured(kvm)) + return KVM_SMCCC_FILTER_HANDLE; + + /* + * But where's the error handling, you say? + * + * mt_find() returns NULL if no entry was found, which just so happens + * to match KVM_SMCCC_FILTER_HANDLE. + */ + val = mt_find(&kvm->arch.smccc_filter, &idx, idx); + return xa_to_value(val); +} + +static u8 kvm_smccc_get_action(struct kvm_vcpu *vcpu, u32 func_id) +{ + /* + * Intervening actions in the SMCCC filter take precedence over the + * pseudo-firmware register bitmaps. 
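 *
 * For example, a userspace KVM_SMCCC_FILTER_DENY entry covering
 * ARM_SMCCC_TRNG_RND64 wins even if KVM_REG_ARM_STD_BIT_TRNG_V1_0 is set
 * in std_bmap: only a HANDLE action falls through to the bitmap and
 * default-allowed checks.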
+ */ + u8 action = kvm_smccc_filter_get_action(vcpu->kvm, func_id); + if (action != KVM_SMCCC_FILTER_HANDLE) + return action; + + if (kvm_smccc_test_fw_bmap(vcpu, func_id) || + kvm_smccc_default_allowed(func_id)) + return KVM_SMCCC_FILTER_HANDLE; + + return KVM_SMCCC_FILTER_DENY; +} + +static void kvm_prepare_hypercall_exit(struct kvm_vcpu *vcpu, u32 func_id) +{ + u8 ec = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu)); + struct kvm_run *run = vcpu->run; + u64 flags = 0; + + if (ec == ESR_ELx_EC_SMC32 || ec == ESR_ELx_EC_SMC64) + flags |= KVM_HYPERCALL_EXIT_SMC; + + if (!kvm_vcpu_trap_il_is32bit(vcpu)) + flags |= KVM_HYPERCALL_EXIT_16BIT; + + run->exit_reason = KVM_EXIT_HYPERCALL; + run->hypercall = (typeof(run->hypercall)) { + .nr = func_id, + .flags = flags, + }; +} + +int kvm_smccc_call_handler(struct kvm_vcpu *vcpu) +{ + struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; + u32 func_id = smccc_get_function(vcpu); + u64 val[4] = {SMCCC_RET_NOT_SUPPORTED}; + u32 feature; + u8 action; + gpa_t gpa; + uuid_t uuid; + + action = kvm_smccc_get_action(vcpu, func_id); + switch (action) { + case KVM_SMCCC_FILTER_HANDLE: + break; + case KVM_SMCCC_FILTER_DENY: + goto out; + case KVM_SMCCC_FILTER_FWD_TO_USER: + kvm_prepare_hypercall_exit(vcpu, func_id); + return 0; + default: + WARN_RATELIMIT(1, "Unhandled SMCCC filter action: %d\n", action); + goto out; + } + + switch (func_id) { + case ARM_SMCCC_VERSION_FUNC_ID: + val[0] = ARM_SMCCC_VERSION_1_1; + break; + case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: + feature = smccc_get_arg1(vcpu); + switch (feature) { + case ARM_SMCCC_ARCH_WORKAROUND_1: + switch (arm64_get_spectre_v2_state()) { + case SPECTRE_VULNERABLE: + break; + case SPECTRE_MITIGATED: + val[0] = SMCCC_RET_SUCCESS; + break; + case SPECTRE_UNAFFECTED: + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + break; + } + break; + case ARM_SMCCC_ARCH_WORKAROUND_2: + switch (arm64_get_spectre_v4_state()) { + case SPECTRE_VULNERABLE: + break; + case SPECTRE_MITIGATED: + /* + * SSBS everywhere: Indicate no firmware + * support, as the SSBS support will be + * indicated to the guest and the default is + * safe. + * + * Otherwise, expose a permanent mitigation + * to the guest, and hide SSBS so that the + * guest stays protected. 
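 *
 * Net result for the guest's ARCH_WORKAROUND_2 query:
 *   SPECTRE_MITIGATED with SSBS    -> SMCCC_RET_NOT_SUPPORTED
 *   SPECTRE_MITIGATED without SSBS -> SMCCC_RET_NOT_REQUIRED
 *   SPECTRE_UNAFFECTED             -> SMCCC_RET_NOT_REQUIRED
 *   SPECTRE_VULNERABLE             -> SMCCC_RET_NOT_SUPPORTED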
+ */ + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) + break; + fallthrough; + case SPECTRE_UNAFFECTED: + val[0] = SMCCC_RET_NOT_REQUIRED; + break; + } + break; + case ARM_SMCCC_ARCH_WORKAROUND_3: + switch (arm64_get_spectre_bhb_state()) { + case SPECTRE_VULNERABLE: + break; + case SPECTRE_MITIGATED: + val[0] = SMCCC_RET_SUCCESS; + break; + case SPECTRE_UNAFFECTED: + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + break; + } + break; + case ARM_SMCCC_HV_PV_TIME_FEATURES: + if (test_bit(KVM_REG_ARM_STD_HYP_BIT_PV_TIME, + &smccc_feat->std_hyp_bmap)) + val[0] = SMCCC_RET_SUCCESS; + break; + } + break; + case ARM_SMCCC_HV_PV_TIME_FEATURES: + val[0] = kvm_hypercall_pv_features(vcpu); + break; + case ARM_SMCCC_HV_PV_TIME_ST: + gpa = kvm_init_stolen_time(vcpu); + if (gpa != INVALID_GPA) + val[0] = gpa; + break; + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + uuid = ARM_SMCCC_VENDOR_HYP_UID_KVM; + val[0] = smccc_uuid_to_reg(&uuid, 0); + val[1] = smccc_uuid_to_reg(&uuid, 1); + val[2] = smccc_uuid_to_reg(&uuid, 2); + val[3] = smccc_uuid_to_reg(&uuid, 3); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + val[0] = smccc_feat->vendor_hyp_bmap; + /* Function numbers 2-63 are reserved for pKVM for now */ + val[2] = smccc_feat->vendor_hyp_bmap_2; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: + kvm_ptp_get_time(vcpu, val); + break; + case ARM_SMCCC_TRNG_VERSION: + case ARM_SMCCC_TRNG_FEATURES: + case ARM_SMCCC_TRNG_GET_UUID: + case ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + return kvm_trng_call(vcpu); + default: + return kvm_psci_call(vcpu); + } + +out: + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); + return 1; +} + +static const u64 kvm_arm_fw_reg_ids[] = { + KVM_REG_ARM_PSCI_VERSION, + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3, + KVM_REG_ARM_STD_BMAP, + KVM_REG_ARM_STD_HYP_BMAP, + KVM_REG_ARM_VENDOR_HYP_BMAP, + KVM_REG_ARM_VENDOR_HYP_BMAP_2, +}; + +void kvm_arm_init_hypercalls(struct kvm *kvm) +{ + struct kvm_smccc_features *smccc_feat = &kvm->arch.smccc_feat; + + smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES; + smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES; + smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; + + mt_init(&kvm->arch.smccc_filter); +} + +void kvm_arm_teardown_hypercalls(struct kvm *kvm) +{ + mtree_destroy(&kvm->arch.smccc_filter); +} + +int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) +{ + return ARRAY_SIZE(kvm_arm_fw_reg_ids); +} + +int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(kvm_arm_fw_reg_ids); i++) { + if (put_user(kvm_arm_fw_reg_ids[i], uindices++)) + return -EFAULT; + } + + return 0; +} + +#define KVM_REG_FEATURE_LEVEL_MASK GENMASK(3, 0) + +/* + * Convert the workaround level into an easy-to-compare number, where higher + * values mean better protection. 
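 * For KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, for instance, that means
 * NOT_AVAIL < AVAIL < NOT_REQUIRED, which is what allows
 * kvm_arm_set_fw_reg() to reject a move to a less protected level with a
 * plain "<" comparison.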
+ */ +static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid) +{ + switch (regid) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + switch (arm64_get_spectre_v2_state()) { + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case SPECTRE_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + switch (arm64_get_spectre_v4_state()) { + case SPECTRE_MITIGATED: + /* + * As for the hypercall discovery, we pretend we + * don't have any FW mitigation if SSBS is there at + * all times. + */ + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + fallthrough; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + } + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + switch (arm64_get_spectre_bhb_state()) { + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; + case SPECTRE_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; + } + + return -EINVAL; +} + +int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + + switch (reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + val = kvm_psci_version(vcpu); + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + val = get_kernel_wa_level(vcpu, reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + break; + case KVM_REG_ARM_STD_BMAP: + val = READ_ONCE(smccc_feat->std_bmap); + break; + case KVM_REG_ARM_STD_HYP_BMAP: + val = READ_ONCE(smccc_feat->std_hyp_bmap); + break; + case KVM_REG_ARM_VENDOR_HYP_BMAP: + val = READ_ONCE(smccc_feat->vendor_hyp_bmap); + break; + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: + val = READ_ONCE(smccc_feat->vendor_hyp_bmap_2); + break; + default: + return -ENOENT; + } + + if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val) +{ + int ret = 0; + struct kvm *kvm = vcpu->kvm; + struct kvm_smccc_features *smccc_feat = &kvm->arch.smccc_feat; + unsigned long *fw_reg_bmap, fw_reg_features; + + switch (reg_id) { + case KVM_REG_ARM_STD_BMAP: + fw_reg_bmap = &smccc_feat->std_bmap; + fw_reg_features = KVM_ARM_SMCCC_STD_FEATURES; + break; + case KVM_REG_ARM_STD_HYP_BMAP: + fw_reg_bmap = &smccc_feat->std_hyp_bmap; + fw_reg_features = KVM_ARM_SMCCC_STD_HYP_FEATURES; + break; + case KVM_REG_ARM_VENDOR_HYP_BMAP: + fw_reg_bmap = &smccc_feat->vendor_hyp_bmap; + fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; + break; + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: + fw_reg_bmap = &smccc_feat->vendor_hyp_bmap_2; + fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2; + break; + default: + return -ENOENT; + } + + /* Check for unsupported bit */ + if (val & ~fw_reg_features) + return -EINVAL; + + mutex_lock(&kvm->arch.config_lock); + + if (kvm_vm_has_ran_once(kvm) && val != *fw_reg_bmap) { + ret = -EBUSY; + goto out; + } + + 
WRITE_ONCE(*fw_reg_bmap, val); +out: + mutex_unlock(&kvm->arch.config_lock); + return ret; +} + +int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + int wa_level; + + if (KVM_REG_SIZE(reg->id) != sizeof(val)) + return -ENOENT; + if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch (reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + { + bool wants_02; + + wants_02 = vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2); + + switch (val) { + case KVM_ARM_PSCI_0_1: + if (wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + case KVM_ARM_PSCI_0_2: + case KVM_ARM_PSCI_1_0: + case KVM_ARM_PSCI_1_1: + case KVM_ARM_PSCI_1_2: + case KVM_ARM_PSCI_1_3: + if (!wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + } + break; + } + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + if (val & ~KVM_REG_FEATURE_LEVEL_MASK) + return -EINVAL; + + if (get_kernel_wa_level(vcpu, reg->id) < val) + return -EINVAL; + + return 0; + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + if (val & ~(KVM_REG_FEATURE_LEVEL_MASK | + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED)) + return -EINVAL; + + /* The enabled bit must not be set unless the level is AVAIL. */ + if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) && + (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL) + return -EINVAL; + + /* + * Map all the possible incoming states to the only two we + * really want to deal with. + */ + switch (val & KVM_REG_FEATURE_LEVEL_MASK) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: + wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: + wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; + break; + default: + return -EINVAL; + } + + /* + * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the + * other way around. + */ + if (get_kernel_wa_level(vcpu, reg->id) < wa_level) + return -EINVAL; + + return 0; + case KVM_REG_ARM_STD_BMAP: + case KVM_REG_ARM_STD_HYP_BMAP: + case KVM_REG_ARM_VENDOR_HYP_BMAP: + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: + return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val); + default: + return -ENOENT; + } + + return -EINVAL; +} + +int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VM_SMCCC_FILTER: + return 0; + default: + return -ENXIO; + } +} + +int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + void __user *uaddr = (void __user *)attr->addr; + + switch (attr->attr) { + case KVM_ARM_VM_SMCCC_FILTER: + return kvm_smccc_set_filter(kvm, uaddr); + default: + return -ENXIO; + } +} diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 81a02a8762b0..dfcd66c65517 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Fault injection for both 32 and 64bit guests. 
* @@ -7,197 +8,355 @@ * Based on arch/arm/kvm/emulate.c * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> #include <asm/esr.h> -#define PSTATE_FAULT_BITS_64 (PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | \ - PSR_I_BIT | PSR_D_BIT) -#define EL1_EXCEPT_SYNC_OFFSET 0x200 - -static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) +static unsigned int exception_target_el(struct kvm_vcpu *vcpu) { - unsigned long cpsr; - unsigned long new_spsr_value = *vcpu_cpsr(vcpu); - bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT); - u32 return_offset = (is_thumb) ? 4 : 0; - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); + /* If not nesting, EL1 is the only possible exception target */ + if (likely(!vcpu_has_nv(vcpu))) + return PSR_MODE_EL1h; - cpsr = mode | COMPAT_PSR_I_BIT; - - if (sctlr & (1 << 30)) - cpsr |= COMPAT_PSR_T_BIT; - if (sctlr & (1 << 25)) - cpsr |= COMPAT_PSR_E_BIT; + /* + * With NV, we need to pick between EL1 and EL2. Note that we + * never deal with a nesting exception here, hence never + * changing context, and the exception itself can be delayed + * until the next entry. + */ + switch(*vcpu_cpsr(vcpu) & PSR_MODE_MASK) { + case PSR_MODE_EL2h: + case PSR_MODE_EL2t: + return PSR_MODE_EL2h; + case PSR_MODE_EL1h: + case PSR_MODE_EL1t: + return PSR_MODE_EL1h; + case PSR_MODE_EL0t: + return vcpu_el2_tge_is_set(vcpu) ? PSR_MODE_EL2h : PSR_MODE_EL1h; + default: + BUG(); + } +} - *vcpu_cpsr(vcpu) = cpsr; +static enum vcpu_sysreg exception_esr_elx(struct kvm_vcpu *vcpu) +{ + if (exception_target_el(vcpu) == PSR_MODE_EL2h) + return ESR_EL2; - /* Note: These now point to the banked copies */ - *vcpu_spsr(vcpu) = new_spsr_value; - *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; + return ESR_EL1; +} - /* Branch to exception vector */ - if (sctlr & (1 << 13)) - vect_offset += 0xffff0000; - else /* always have security exceptions */ - vect_offset += vcpu_cp15(vcpu, c12_VBAR); +static enum vcpu_sysreg exception_far_elx(struct kvm_vcpu *vcpu) +{ + if (exception_target_el(vcpu) == PSR_MODE_EL2h) + return FAR_EL2; - *vcpu_pc(vcpu) = vect_offset; + return FAR_EL1; } -static void inject_undef32(struct kvm_vcpu *vcpu) +static void pend_sync_exception(struct kvm_vcpu *vcpu) { - prepare_fault32(vcpu, COMPAT_PSR_MODE_UND, 4); + if (exception_target_el(vcpu) == PSR_MODE_EL1h) + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + else + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC); } -/* - * Modelled after TakeDataAbortException() and TakePrefetchAbortException - * pseudocode. 
- */ -static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, - unsigned long addr) +static void pend_serror_exception(struct kvm_vcpu *vcpu) { - u32 vect_offset; - u32 *far, *fsr; - bool is_lpae; + if (exception_target_el(vcpu) == PSR_MODE_EL1h) + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SERR); + else + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR); +} - if (is_pabt) { - vect_offset = 12; - far = &vcpu_cp15(vcpu, c6_IFAR); - fsr = &vcpu_cp15(vcpu, c5_IFSR); - } else { /* !iabt */ - vect_offset = 16; - far = &vcpu_cp15(vcpu, c6_DFAR); - fsr = &vcpu_cp15(vcpu, c5_DFSR); - } +static bool __effective_sctlr2_bit(struct kvm_vcpu *vcpu, unsigned int idx) +{ + u64 sctlr2; - prepare_fault32(vcpu, COMPAT_PSR_MODE_ABT | COMPAT_PSR_A_BIT, vect_offset); + if (!kvm_has_sctlr2(vcpu->kvm)) + return false; - *far = addr; + if (is_nested_ctxt(vcpu) && + !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_SCTLR2En)) + return false; - /* Give the guest an IMPLEMENTATION DEFINED exception */ - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); - if (is_lpae) - *fsr = 1 << 9 | 0x34; + if (exception_target_el(vcpu) == PSR_MODE_EL1h) + sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL1); else - *fsr = 0x14; + sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL2); + + return sctlr2 & BIT(idx); +} + +static bool effective_sctlr2_ease(struct kvm_vcpu *vcpu) +{ + return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_EASE_SHIFT); +} + +static bool effective_sctlr2_nmea(struct kvm_vcpu *vcpu) +{ + return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_NMEA_SHIFT); } static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr) { unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_aarch32; - u32 esr = 0; + bool is_aarch32 = vcpu_mode_is_32bit(vcpu); + u64 esr = 0, fsc; + int level; + + /* + * If injecting an abort from a failed S1PTW, rewalk the S1 PTs to + * find the failing level. If we can't find it, assume the error was + * transient and restart without changing the state. + */ + if (kvm_vcpu_abt_iss1tw(vcpu)) { + u64 hpfar = kvm_vcpu_get_fault_ipa(vcpu); + int ret; - is_aarch32 = vcpu_mode_is_32bit(vcpu); + if (hpfar == INVALID_GPA) + return; - *vcpu_spsr(vcpu) = cpsr; - *vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu); + ret = __kvm_find_s1_desc_level(vcpu, addr, hpfar, &level); + if (ret) + return; - *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64; - *vcpu_pc(vcpu) = vcpu_sys_reg(vcpu, VBAR_EL1) + EL1_EXCEPT_SYNC_OFFSET; + WARN_ON_ONCE(level < -1 || level > 3); + fsc = ESR_ELx_FSC_SEA_TTW(level); + } else { + fsc = ESR_ELx_FSC_EXTABT; + } - vcpu_sys_reg(vcpu, FAR_EL1) = addr; + /* This delight is brought to you by FEAT_DoubleFault2. */ + if (effective_sctlr2_ease(vcpu)) + pend_serror_exception(vcpu); + else + pend_sync_exception(vcpu); /* * Build an {i,d}abort, depending on the level and the * instruction set. Report an external synchronous abort. */ if (kvm_vcpu_trap_il_is32bit(vcpu)) - esr |= ESR_EL1_IL; + esr |= ESR_ELx_IL; /* * Here, the guest runs in AArch64 mode when in EL1. If we get * an AArch32 fault, it means we managed to trap an EL0 fault. 
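 *
 * Note the EC arithmetic below: a fault from (v)EL0 starts out as
 * ESR_ELx_EC_IABT_LOW, one from the current EL as ESR_ELx_EC_IABT_CUR,
 * and ORing in ESR_ELx_EC_DABT_LOW turns either into the matching
 * data-abort encoding (0x20|0x24 = 0x24, 0x21|0x24 = 0x25).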
*/ if (is_aarch32 || (cpsr & PSR_MODE_MASK) == PSR_MODE_EL0t) - esr |= (ESR_EL1_EC_IABT_EL0 << ESR_EL1_EC_SHIFT); + esr |= (ESR_ELx_EC_IABT_LOW << ESR_ELx_EC_SHIFT); else - esr |= (ESR_EL1_EC_IABT_EL1 << ESR_EL1_EC_SHIFT); + esr |= (ESR_ELx_EC_IABT_CUR << ESR_ELx_EC_SHIFT); if (!is_iabt) - esr |= ESR_EL1_EC_DABT_EL0; + esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT; - vcpu_sys_reg(vcpu, ESR_EL1) = esr | ESR_EL2_EC_xABT_xFSR_EXTABT; + esr |= fsc; + + vcpu_write_sys_reg(vcpu, addr, exception_far_elx(vcpu)); + vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); } static void inject_undef64(struct kvm_vcpu *vcpu) { - unsigned long cpsr = *vcpu_cpsr(vcpu); - u32 esr = (ESR_EL1_EC_UNKNOWN << ESR_EL1_EC_SHIFT); + u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - *vcpu_spsr(vcpu) = cpsr; - *vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu); - - *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64; - *vcpu_pc(vcpu) = vcpu_sys_reg(vcpu, VBAR_EL1) + EL1_EXCEPT_SYNC_OFFSET; + pend_sync_exception(vcpu); /* * Build an unknown exception, depending on the instruction * set. */ if (kvm_vcpu_trap_il_is32bit(vcpu)) - esr |= ESR_EL1_IL; + esr |= ESR_ELx_IL; - vcpu_sys_reg(vcpu, ESR_EL1) = esr; + vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); } -/** - * kvm_inject_dabt - inject a data abort into the guest - * @vcpu: The VCPU to receive the undefined exception - * @addr: The address to report in the DFAR - * - * It is assumed that this code is called from the VCPU thread and that the - * VCPU therefore is not currently executing guest code. +#define DFSR_FSC_EXTABT_LPAE 0x10 +#define DFSR_FSC_EXTABT_nLPAE 0x08 +#define DFSR_LPAE BIT(9) +#define TTBCR_EAE BIT(31) + +static void inject_undef32(struct kvm_vcpu *vcpu) +{ + kvm_pend_exception(vcpu, EXCEPT_AA32_UND); +} + +/* + * Modelled after TakeDataAbortException() and TakePrefetchAbortException + * pseudocode. */ -void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) +static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) { - if (!(vcpu->arch.hcr_el2 & HCR_RW)) - inject_abt32(vcpu, false, addr); + u64 far; + u32 fsr; + + /* Give the guest an IMPLEMENTATION DEFINED exception */ + if (vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE) { + fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; + } else { + /* no need to shuffle FS[4] into DFSR[10] as it's 0 */ + fsr = DFSR_FSC_EXTABT_nLPAE; + } + + far = vcpu_read_sys_reg(vcpu, FAR_EL1); - inject_abt64(vcpu, false, addr); + if (is_pabt) { + kvm_pend_exception(vcpu, EXCEPT_AA32_IABT); + far &= GENMASK(31, 0); + far |= (u64)addr << 32; + vcpu_write_sys_reg(vcpu, fsr, IFSR32_EL2); + } else { /* !iabt */ + kvm_pend_exception(vcpu, EXCEPT_AA32_DABT); + far &= GENMASK(63, 32); + far |= addr; + vcpu_write_sys_reg(vcpu, fsr, ESR_EL1); + } + + vcpu_write_sys_reg(vcpu, far, FAR_EL1); } -/** - * kvm_inject_pabt - inject a prefetch abort into the guest - * @vcpu: The VCPU to receive the undefined exception - * @addr: The address to report in the DFAR - * - * It is assumed that this code is called from the VCPU thread and that the - * VCPU therefore is not currently executing guest code. 
- */ -void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) +static void __kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr) +{ + if (vcpu_el1_is_32bit(vcpu)) + inject_abt32(vcpu, iabt, addr); + else + inject_abt64(vcpu, iabt, addr); +} + +static bool kvm_sea_target_is_el2(struct kvm_vcpu *vcpu) +{ + if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_TGE | HCR_TEA)) + return true; + + if (!vcpu_mode_priv(vcpu)) + return false; + + return (*vcpu_cpsr(vcpu) & PSR_A_BIT) && + (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA); +} + +int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr) +{ + lockdep_assert_held(&vcpu->mutex); + + if (is_nested_ctxt(vcpu) && kvm_sea_target_is_el2(vcpu)) + return kvm_inject_nested_sea(vcpu, iabt, addr); + + __kvm_inject_sea(vcpu, iabt, addr); + return 1; +} + +void kvm_inject_size_fault(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.hcr_el2 & HCR_RW)) - inject_abt32(vcpu, true, addr); + unsigned long addr, esr; + + addr = kvm_vcpu_get_fault_ipa(vcpu); + addr |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + + __kvm_inject_sea(vcpu, kvm_vcpu_trap_is_iabt(vcpu), addr); + + /* + * If AArch64 or LPAE, set FSC to 0 to indicate an Address + * Size Fault at level 0, as if exceeding PARange. + * + * Non-LPAE guests will only get the external abort, as there + * is no way to describe the ASF. + */ + if (vcpu_el1_is_32bit(vcpu) && + !(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE)) + return; - inject_abt64(vcpu, true, addr); + esr = vcpu_read_sys_reg(vcpu, exception_esr_elx(vcpu)); + esr &= ~GENMASK_ULL(5, 0); + vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); } /** * kvm_inject_undefined - inject an undefined instruction into the guest + * @vcpu: The vCPU in which to inject the exception * * It is assumed that this code is called from the VCPU thread and that the * VCPU therefore is not currently executing guest code. */ void kvm_inject_undefined(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.hcr_el2 & HCR_RW)) + if (vcpu_el1_is_32bit(vcpu)) inject_undef32(vcpu); + else + inject_undef64(vcpu); +} + +static bool serror_is_masked(struct kvm_vcpu *vcpu) +{ + return (*vcpu_cpsr(vcpu) & PSR_A_BIT) && !effective_sctlr2_nmea(vcpu); +} + +static bool kvm_serror_target_is_el2(struct kvm_vcpu *vcpu) +{ + if (is_hyp_ctxt(vcpu) || vcpu_el2_amo_is_set(vcpu)) + return true; + + if (!(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA)) + return false; + + /* + * In another example where FEAT_DoubleFault2 is entirely backwards, + * "masked" as it relates to the routing effects of HCRX_EL2.TMEA + * doesn't consider SCTLR2_EL1.NMEA. That is to say, even if EL1 asked + * for non-maskable SErrors, the EL2 bit takes priority if A is set. + */ + if (vcpu_mode_priv(vcpu)) + return *vcpu_cpsr(vcpu) & PSR_A_BIT; + + /* + * Otherwise SErrors are considered unmasked when taken from EL0 and + * NMEA is set. + */ + return serror_is_masked(vcpu); +} + +static bool kvm_serror_undeliverable_at_el2(struct kvm_vcpu *vcpu) +{ + return !(vcpu_el2_tge_is_set(vcpu) || vcpu_el2_amo_is_set(vcpu)); +} + +int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr) +{ + lockdep_assert_held(&vcpu->mutex); + + if (is_nested_ctxt(vcpu) && kvm_serror_target_is_el2(vcpu)) + return kvm_inject_nested_serror(vcpu, esr); + + if (vcpu_is_el2(vcpu) && kvm_serror_undeliverable_at_el2(vcpu)) { + vcpu_set_vsesr(vcpu, esr); + vcpu_set_flag(vcpu, NESTED_SERROR_PENDING); + return 1; + } + + /* + * Emulate the exception entry if SErrors are unmasked. 
This is useful if + * the vCPU is in a nested context w/ vSErrors enabled then we've already + * delegated he hardware vSError context (i.e. HCR_EL2.VSE, VSESR_EL2, + * VDISR_EL2) to the guest hypervisor. + * + * As we're emulating the SError injection we need to explicitly populate + * ESR_ELx.EC because hardware will not do it on our behalf. + */ + if (!serror_is_masked(vcpu)) { + pend_serror_exception(vcpu); + esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR); + vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); + return 1; + } - inject_undef64(vcpu); + vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK); + *vcpu_hcr(vcpu) |= HCR_VSE; + return 1; } diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c new file mode 100644 index 000000000000..54f9358c9e0e --- /dev/null +++ b/arch/arm64/kvm/mmio.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include <trace/events/kvm.h> + +#include "trace.h" + +void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data) +{ + void *datap = NULL; + union { + u8 byte; + u16 hword; + u32 word; + u64 dword; + } tmp; + + switch (len) { + case 1: + tmp.byte = data; + datap = &tmp.byte; + break; + case 2: + tmp.hword = data; + datap = &tmp.hword; + break; + case 4: + tmp.word = data; + datap = &tmp.word; + break; + case 8: + tmp.dword = data; + datap = &tmp.dword; + break; + } + + memcpy(buf, datap, len); +} + +unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) +{ + unsigned long data = 0; + union { + u16 hword; + u32 word; + u64 dword; + } tmp; + + switch (len) { + case 1: + data = *(u8 *)buf; + break; + case 2: + memcpy(&tmp.hword, buf, len); + data = tmp.hword; + break; + case 4: + memcpy(&tmp.word, buf, len); + data = tmp.word; + break; + case 8: + memcpy(&tmp.dword, buf, len); + data = tmp.dword; + break; + } + + return data; +} + +static bool kvm_pending_external_abort(struct kvm_vcpu *vcpu) +{ + if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) + return false; + + if (vcpu_el1_is_32bit(vcpu)) { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA32_UND): + case unpack_vcpu_flag(EXCEPT_AA32_IABT): + case unpack_vcpu_flag(EXCEPT_AA32_DABT): + return true; + default: + return false; + } + } else { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR): + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR): + return true; + default: + return false; + } + } +} + +/** + * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation + * or in-kernel IO emulation + * + * @vcpu: The VCPU pointer + */ +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) +{ + unsigned long data; + unsigned int len; + int mask; + + /* + * Detect if the MMIO return was already handled or if userspace aborted + * the MMIO access. 
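 *
 * For reference, the userspace half of this contract is roughly the
 * following sketch, where run is the mmap'd vcpu run structure and
 * vcpu_fd/emulate_read() are placeholders for whatever the VMM provides:
 *
 *	if (run->exit_reason == KVM_EXIT_MMIO) {
 *		if (!run->mmio.is_write)
 *			emulate_read(run->mmio.phys_addr,
 *				     run->mmio.data, run->mmio.len);
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *	}
 *
 * and the subsequent KVM_RUN lands back here to complete the access.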
+ */ + if (unlikely(!vcpu->mmio_needed || kvm_pending_external_abort(vcpu))) + return 1; + + vcpu->mmio_needed = 0; + + if (!kvm_vcpu_dabt_iswrite(vcpu)) { + struct kvm_run *run = vcpu->run; + + len = kvm_vcpu_dabt_get_as(vcpu); + data = kvm_mmio_read_buf(run->mmio.data, len); + + if (kvm_vcpu_dabt_issext(vcpu) && + len < sizeof(unsigned long)) { + mask = 1U << ((len * 8) - 1); + data = (data ^ mask) - mask; + } + + if (!kvm_vcpu_dabt_issf(vcpu)) + data = data & 0xffffffff; + + trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, + &data); + data = vcpu_data_host_to_guest(vcpu, data, len); + vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); + } + + /* + * The MMIO instruction is emulated and should not be re-executed + * in the guest. + */ + kvm_incr_pc(vcpu); + + return 1; +} + +int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) +{ + struct kvm_run *run = vcpu->run; + unsigned long data; + unsigned long rt; + int ret; + bool is_write; + int len; + u8 data_buf[8]; + + /* + * No valid syndrome? Ask userspace for help if it has + * volunteered to do so, and bail out otherwise. + * + * In the protected VM case, there isn't much userspace can do + * though, so directly deliver an exception to the guest. + */ + if (!kvm_vcpu_dabt_isvalid(vcpu)) { + trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), + kvm_vcpu_get_hfar(vcpu), fault_ipa); + + if (vcpu_is_protected(vcpu)) + return kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, + &vcpu->kvm->arch.flags)) { + run->exit_reason = KVM_EXIT_ARM_NISV; + run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); + run->arm_nisv.fault_ipa = fault_ipa; + return 0; + } + + return -ENOSYS; + } + + /* + * Prepare MMIO operation. First decode the syndrome data we get + * from the CPU. Then try if some in-kernel emulation feels + * responsible, otherwise let user space do its magic. + */ + is_write = kvm_vcpu_dabt_iswrite(vcpu); + len = kvm_vcpu_dabt_get_as(vcpu); + rt = kvm_vcpu_dabt_get_rd(vcpu); + + if (is_write) { + data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), + len); + + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); + kvm_mmio_write_buf(data_buf, len, data); + + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); + } else { + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, + fault_ipa, NULL); + + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); + } + + /* Now prepare kvm_run for the potential return to userland. */ + run->mmio.is_write = is_write; + run->mmio.phys_addr = fault_ipa; + run->mmio.len = len; + vcpu->mmio_needed = 1; + + if (!ret) { + /* We handled the access successfully in the kernel. 
*/ + if (!is_write) + memcpy(run->mmio.data, data_buf, len); + vcpu->stat.mmio_exit_kernel++; + kvm_handle_mmio_return(vcpu); + return 1; + } + + if (is_write) + memcpy(run->mmio.data, data_buf, len); + vcpu->stat.mmio_exit_user++; + run->exit_reason = KVM_EXIT_MMIO; + return 0; +} diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c new file mode 100644 index 000000000000..48d7c372a4cd --- /dev/null +++ b/arch/arm64/kvm/mmu.c @@ -0,0 +1,2575 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + */ + +#include <linux/acpi.h> +#include <linux/mman.h> +#include <linux/kvm_host.h> +#include <linux/io.h> +#include <linux/hugetlb.h> +#include <linux/sched/signal.h> +#include <trace/events/kvm.h> +#include <asm/acpi.h> +#include <asm/pgalloc.h> +#include <asm/cacheflush.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/virt.h> + +#include "trace.h" + +static struct kvm_pgtable *hyp_pgtable; +static DEFINE_MUTEX(kvm_hyp_pgd_mutex); + +static unsigned long __ro_after_init hyp_idmap_start; +static unsigned long __ro_after_init hyp_idmap_end; +static phys_addr_t __ro_after_init hyp_idmap_vector; + +u32 __ro_after_init __hyp_va_bits; + +static unsigned long __ro_after_init io_map_base; + +#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) + +static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, + phys_addr_t size) +{ + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? boundary : end; +} + +static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + + return __stage2_range_addr_end(addr, end, size); +} + +/* + * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, + * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, + * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too + * long will also starve other vCPUs. We have to also make sure that the page + * tables are not freed while we released the lock. + */ +static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, + phys_addr_t end, + int (*fn)(struct kvm_pgtable *, u64, u64), + bool resched) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); + int ret; + u64 next; + + do { + struct kvm_pgtable *pgt = mmu->pgt; + if (!pgt) + return -EINVAL; + + next = stage2_range_addr_end(addr, end); + ret = fn(pgt, addr, next - addr); + if (ret) + break; + + if (resched && next != end) + cond_resched_rwlock_write(&kvm->mmu_lock); + } while (addr = next, addr != end); + + return ret; +} + +#define stage2_apply_range_resched(mmu, addr, end, fn) \ + stage2_apply_range(mmu, addr, end, fn, true) + +/* + * Get the maximum number of page-tables pages needed to split a range + * of blocks into PAGE_SIZE PTEs. It assumes the range is already + * mapped at level 2, or at level 1 if allowed. 
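 *
 * Worked example, assuming a 4K granule where level-1 (1GiB) blocks are
 * allowed: splitting a 1GiB chunk needs at most 1 page for the replacement
 * PMD table plus 512 pages for the PTE tables, i.e. the function returns
 * 513 for range = SZ_1G.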
+ */ +static int kvm_mmu_split_nr_page_tables(u64 range) +{ + int n = 0; + + if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) + n += DIV_ROUND_UP(range, PUD_SIZE); + n += DIV_ROUND_UP(range, PMD_SIZE); + return n; +} + +static bool need_split_memcache_topup_or_resched(struct kvm *kvm) +{ + struct kvm_mmu_memory_cache *cache; + u64 chunk_size, min; + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; + + chunk_size = kvm->arch.mmu.split_page_chunk_size; + min = kvm_mmu_split_nr_page_tables(chunk_size); + cache = &kvm->arch.mmu.split_page_cache; + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end) +{ + struct kvm_mmu_memory_cache *cache; + struct kvm_pgtable *pgt; + int ret, cache_capacity; + u64 next, chunk_size; + + lockdep_assert_held_write(&kvm->mmu_lock); + + chunk_size = kvm->arch.mmu.split_page_chunk_size; + cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); + + if (chunk_size == 0) + return 0; + + cache = &kvm->arch.mmu.split_page_cache; + + do { + if (need_split_memcache_topup_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* Eager page splitting is best-effort. */ + ret = __kvm_mmu_topup_memory_cache(cache, + cache_capacity, + cache_capacity); + write_lock(&kvm->mmu_lock); + if (ret) + break; + } + + pgt = kvm->arch.mmu.pgt; + if (!pgt) + return -EINVAL; + + next = __stage2_range_addr_end(addr, end, chunk_size); + ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); + if (ret) + break; + } while (addr = next, addr != end); + + return ret; +} + +static bool memslot_is_logging(struct kvm_memory_slot *memslot) +{ + return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); +} + +/** + * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 + * @kvm: pointer to kvm structure. 
+ * + * Interface to HYP function to flush all VM TLB entries + */ +int kvm_arch_flush_remote_tlbs(struct kvm *kvm) +{ + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); + return 0; +} + +int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, + gfn_t gfn, u64 nr_pages) +{ + u64 size = nr_pages << PAGE_SHIFT; + u64 addr = gfn << PAGE_SHIFT; + + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); + return 0; +} + +static void *stage2_memcache_zalloc_page(void *arg) +{ + struct kvm_mmu_memory_cache *mc = arg; + void *virt; + + /* Allocated with __GFP_ZERO, so no need to zero */ + virt = kvm_mmu_memory_cache_alloc(mc); + if (virt) + kvm_account_pgtable_pages(virt, 1); + return virt; +} + +static void *kvm_host_zalloc_pages_exact(size_t size) +{ + return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); +} + +static void *kvm_s2_zalloc_pages_exact(size_t size) +{ + void *virt = kvm_host_zalloc_pages_exact(size); + + if (virt) + kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); + return virt; +} + +static void kvm_s2_free_pages_exact(void *virt, size_t size) +{ + kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); + free_pages_exact(virt, size); +} + +static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; + +static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) +{ + struct page *page = container_of(head, struct page, rcu_head); + void *pgtable = page_to_virt(page); + s8 level = page_private(page); + + KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); +} + +static void stage2_free_unlinked_table(void *addr, s8 level) +{ + struct page *page = virt_to_page(addr); + + set_page_private(page, (unsigned long)level); + call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); +} + +static void kvm_host_get_page(void *addr) +{ + get_page(virt_to_page(addr)); +} + +static void kvm_host_put_page(void *addr) +{ + put_page(virt_to_page(addr)); +} + +static void kvm_s2_put_page(void *addr) +{ + struct page *p = virt_to_page(addr); + /* Dropping last refcount, the page will be freed */ + if (page_count(p) == 1) + kvm_account_pgtable_pages(addr, -1); + put_page(p); +} + +static int kvm_host_page_count(void *addr) +{ + return page_count(virt_to_page(addr)); +} + +static phys_addr_t kvm_host_pa(void *addr) +{ + return __pa(addr); +} + +static void *kvm_host_va(phys_addr_t phys) +{ + return __va(phys); +} + +static void clean_dcache_guest_page(void *va, size_t size) +{ + __clean_dcache_guest_page(va, size); +} + +static void invalidate_icache_guest_page(void *va, size_t size) +{ + __invalidate_icache_guest_page(va, size); +} + +/* + * Unmapping vs dcache management: + * + * If a guest maps certain memory pages as uncached, all writes will + * bypass the data cache and go directly to RAM. However, the CPUs + * can still speculate reads (not writes) and fill cache lines with + * data. + * + * Those cache lines will be *clean* cache lines though, so a + * clean+invalidate operation is equivalent to an invalidate + * operation, because no cache lines are marked dirty. + * + * Those clean cache lines could be filled prior to an uncached write + * by the guest, and the cache coherent IO subsystem would therefore + * end up writing old data to disk. 
+ * + * This is why right after unmapping a page/section and invalidating + * the corresponding TLBs, we flush to make sure the IO subsystem will + * never hit in the cache. + * + * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as + * we then fully enforce cacheability of RAM, no matter what the guest + * does. + */ +/** + * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range + * @mmu: The KVM stage-2 MMU pointer + * @start: The intermediate physical base address of the range to unmap + * @size: The size of the area to unmap + * @may_block: Whether or not we are permitted to block + * + * Clear a range of stage-2 mappings, lowering the various ref-counts. Must + * be called while holding mmu_lock (unless for freeing the stage2 pgd before + * destroying the VM), otherwise another faulting VCPU may come in and mess + * with things behind our backs. + */ +static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, + bool may_block) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); + phys_addr_t end = start + size; + + lockdep_assert_held_write(&kvm->mmu_lock); + WARN_ON(size & ~PAGE_MASK); + WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), + may_block)); +} + +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block) +{ + __unmap_stage2_range(mmu, start, size, may_block); +} + +void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) +{ + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); +} + +static void stage2_flush_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t end = addr + PAGE_SIZE * memslot->npages; + + kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); +} + +/** + * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 + * @kvm: The struct kvm pointer + * + * Go through the stage 2 page tables and invalidate any cache lines + * backing memory already mapped to the VM. + */ +static void stage2_flush_vm(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int idx, bkt; + + idx = srcu_read_lock(&kvm->srcu); + write_lock(&kvm->mmu_lock); + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, bkt, slots) + stage2_flush_memslot(kvm, memslot); + + kvm_nested_s2_flush(kvm); + + write_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); +} + +/** + * free_hyp_pgds - free Hyp-mode page tables + */ +void __init free_hyp_pgds(void) +{ + mutex_lock(&kvm_hyp_pgd_mutex); + if (hyp_pgtable) { + kvm_pgtable_hyp_destroy(hyp_pgtable); + kfree(hyp_pgtable); + hyp_pgtable = NULL; + } + mutex_unlock(&kvm_hyp_pgd_mutex); +} + +static bool kvm_host_owns_hyp_mappings(void) +{ + if (is_kernel_in_hyp_mode()) + return false; + + if (static_branch_likely(&kvm_protected_mode_initialized)) + return false; + + /* + * This can happen at boot time when __create_hyp_mappings() is called + * after the hyp protection has been enabled, but the static key has + * not been flipped yet. 
+ */ + if (!hyp_pgtable && is_protected_kvm_enabled()) + return false; + + WARN_ON(!hyp_pgtable); + + return true; +} + +int __create_hyp_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) +{ + int err; + + if (WARN_ON(!kvm_host_owns_hyp_mappings())) + return -EINVAL; + + mutex_lock(&kvm_hyp_pgd_mutex); + err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); + mutex_unlock(&kvm_hyp_pgd_mutex); + + return err; +} + +static phys_addr_t kvm_kaddr_to_phys(void *kaddr) +{ + if (!is_vmalloc_addr(kaddr)) { + BUG_ON(!virt_addr_valid(kaddr)); + return __pa(kaddr); + } else { + return page_to_phys(vmalloc_to_page(kaddr)) + + offset_in_page(kaddr); + } +} + +struct hyp_shared_pfn { + u64 pfn; + int count; + struct rb_node node; +}; + +static DEFINE_MUTEX(hyp_shared_pfns_lock); +static struct rb_root hyp_shared_pfns = RB_ROOT; + +static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, + struct rb_node **parent) +{ + struct hyp_shared_pfn *this; + + *node = &hyp_shared_pfns.rb_node; + *parent = NULL; + while (**node) { + this = container_of(**node, struct hyp_shared_pfn, node); + *parent = **node; + if (this->pfn < pfn) + *node = &((**node)->rb_left); + else if (this->pfn > pfn) + *node = &((**node)->rb_right); + else + return this; + } + + return NULL; +} + +static int share_pfn_hyp(u64 pfn) +{ + struct rb_node **node, *parent; + struct hyp_shared_pfn *this; + int ret = 0; + + mutex_lock(&hyp_shared_pfns_lock); + this = find_shared_pfn(pfn, &node, &parent); + if (this) { + this->count++; + goto unlock; + } + + this = kzalloc(sizeof(*this), GFP_KERNEL); + if (!this) { + ret = -ENOMEM; + goto unlock; + } + + this->pfn = pfn; + this->count = 1; + rb_link_node(&this->node, parent, node); + rb_insert_color(&this->node, &hyp_shared_pfns); + ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1); +unlock: + mutex_unlock(&hyp_shared_pfns_lock); + + return ret; +} + +static int unshare_pfn_hyp(u64 pfn) +{ + struct rb_node **node, *parent; + struct hyp_shared_pfn *this; + int ret = 0; + + mutex_lock(&hyp_shared_pfns_lock); + this = find_shared_pfn(pfn, &node, &parent); + if (WARN_ON(!this)) { + ret = -ENOENT; + goto unlock; + } + + this->count--; + if (this->count) + goto unlock; + + rb_erase(&this->node, &hyp_shared_pfns); + kfree(this); + ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1); +unlock: + mutex_unlock(&hyp_shared_pfns_lock); + + return ret; +} + +int kvm_share_hyp(void *from, void *to) +{ + phys_addr_t start, end, cur; + u64 pfn; + int ret; + + if (is_kernel_in_hyp_mode()) + return 0; + + /* + * The share hcall maps things in the 'fixed-offset' region of the hyp + * VA space, so we can only share physically contiguous data-structures + * for now. 
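share_pfn_hyp() and unshare_pfn_hyp() above only issue the __pkvm_host_share_hyp / __pkvm_host_unshare_hyp hypercalls on the first share and the last unshare of a given pfn; intermediate calls just adjust the refcount held in the rb-tree. A deliberately simplified, self-contained model of that behaviour (a fixed-size array instead of an rb-tree, printf() standing in for the hypercall, no locking) might look like this:

#include <stdint.h>
#include <stdio.h>

#define MAX_SHARED 64	/* toy capacity; the kernel rb-tree has no such limit */

static struct { uint64_t pfn; int count; } shared[MAX_SHARED];

static int share_pfn(uint64_t pfn)
{
	for (int i = 0; i < MAX_SHARED; i++) {
		if (shared[i].count && shared[i].pfn == pfn) {
			shared[i].count++;	/* already shared: no hypercall */
			return 0;
		}
	}
	for (int i = 0; i < MAX_SHARED; i++) {
		if (!shared[i].count) {
			shared[i].pfn = pfn;
			shared[i].count = 1;	/* first share: tell the hypervisor */
			printf("hyp share pfn %#llx\n", (unsigned long long)pfn);
			return 0;
		}
	}
	return -1;	/* toy table full */
}

static void unshare_pfn(uint64_t pfn)
{
	for (int i = 0; i < MAX_SHARED; i++) {
		if (shared[i].count && shared[i].pfn == pfn) {
			if (--shared[i].count == 0)	/* last user: tell the hypervisor */
				printf("hyp unshare pfn %#llx\n",
				       (unsigned long long)pfn);
			return;
		}
	}
}

int main(void)
{
	share_pfn(0x1234);	/* "hyp share" issued */
	share_pfn(0x1234);	/* refcount only */
	unshare_pfn(0x1234);	/* refcount only */
	unshare_pfn(0x1234);	/* "hyp unshare" issued */
	return 0;
}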
+ */ + if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) + return -EINVAL; + + if (kvm_host_owns_hyp_mappings()) + return create_hyp_mappings(from, to, PAGE_HYP); + + start = ALIGN_DOWN(__pa(from), PAGE_SIZE); + end = PAGE_ALIGN(__pa(to)); + for (cur = start; cur < end; cur += PAGE_SIZE) { + pfn = __phys_to_pfn(cur); + ret = share_pfn_hyp(pfn); + if (ret) + return ret; + } + + return 0; +} + +void kvm_unshare_hyp(void *from, void *to) +{ + phys_addr_t start, end, cur; + u64 pfn; + + if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) + return; + + start = ALIGN_DOWN(__pa(from), PAGE_SIZE); + end = PAGE_ALIGN(__pa(to)); + for (cur = start; cur < end; cur += PAGE_SIZE) { + pfn = __phys_to_pfn(cur); + WARN_ON(unshare_pfn_hyp(pfn)); + } +} + +/** + * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode + * @from: The virtual kernel start address of the range + * @to: The virtual kernel end address of the range (exclusive) + * @prot: The protection to be applied to this range + * + * The same virtual address as the kernel virtual address is also used + * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying + * physical pages. + */ +int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +{ + phys_addr_t phys_addr; + unsigned long virt_addr; + unsigned long start = kern_hyp_va((unsigned long)from); + unsigned long end = kern_hyp_va((unsigned long)to); + + if (is_kernel_in_hyp_mode()) + return 0; + + if (!kvm_host_owns_hyp_mappings()) + return -EPERM; + + start = start & PAGE_MASK; + end = PAGE_ALIGN(end); + + for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { + int err; + + phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); + err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, + prot); + if (err) + return err; + } + + return 0; +} + +static int __hyp_alloc_private_va_range(unsigned long base) +{ + lockdep_assert_held(&kvm_hyp_pgd_mutex); + + if (!PAGE_ALIGNED(base)) + return -EINVAL; + + /* + * Verify that BIT(VA_BITS - 1) hasn't been flipped by + * allocating the new area, as it would indicate we've + * overflowed the idmap/IO address range. + */ + if ((base ^ io_map_base) & BIT(VA_BITS - 1)) + return -ENOMEM; + + io_map_base = base; + + return 0; +} + +/** + * hyp_alloc_private_va_range - Allocates a private VA range. + * @size: The size of the VA range to reserve. + * @haddr: The hypervisor virtual start address of the allocation. + * + * The private virtual address (VA) range is allocated below io_map_base + * and aligned based on the order of @size. + * + * Return: 0 on success or negative error code on failure. + */ +int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) +{ + unsigned long base; + int ret = 0; + + mutex_lock(&kvm_hyp_pgd_mutex); + + /* + * This assumes that we have enough space below the idmap + * page to allocate our VAs. If not, the check in + * __hyp_alloc_private_va_range() will kick. A potential + * alternative would be to detect that overflow and switch + * to an allocation above the idmap. + * + * The allocated size is always a multiple of PAGE_SIZE. 
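A condensed, stand-alone sketch of the allocation step hyp_alloc_private_va_range() performs around this comment: the private VA region grows downwards from io_map_base, and crossing into the other half of the VA space is caught by watching bit VA_BITS - 1 flip. VA_BITS and the initial io_map_base below are made-up example values, and the order-based alignment used for stacks is not modelled:

#include <stdint.h>
#include <stdio.h>

#define VA_BITS		48		/* assumption for the example */
#define PAGE_SIZE	4096ULL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Made-up starting point; the kernel initialises this from the idmap. */
static uint64_t io_map_base = (1ULL << (VA_BITS - 1)) + (1ULL << 30);

/* Allocate 'size' bytes of private VA below io_map_base, as the kernel does. */
static int alloc_private_va(uint64_t size, uint64_t *va)
{
	uint64_t base = io_map_base - PAGE_ALIGN(size);

	/*
	 * If bit (VA_BITS - 1) changed, the allocation crossed into the
	 * other half of the VA space: report it as out of memory.
	 */
	if ((base ^ io_map_base) & (1ULL << (VA_BITS - 1)))
		return -1;

	io_map_base = base;	/* the next allocation starts below this one */
	*va = base;
	return 0;
}

int main(void)
{
	uint64_t va;

	if (!alloc_private_va(3 * PAGE_SIZE + 100, &va))	/* rounds up to 4 pages */
		printf("allocated at %#llx\n", (unsigned long long)va);
	return 0;
}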
+ */ + size = PAGE_ALIGN(size); + base = io_map_base - size; + ret = __hyp_alloc_private_va_range(base); + + mutex_unlock(&kvm_hyp_pgd_mutex); + + if (!ret) + *haddr = base; + + return ret; +} + +static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, + unsigned long *haddr, + enum kvm_pgtable_prot prot) +{ + unsigned long addr; + int ret = 0; + + if (!kvm_host_owns_hyp_mappings()) { + addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, + phys_addr, size, prot); + if (IS_ERR_VALUE(addr)) + return addr; + *haddr = addr; + + return 0; + } + + size = PAGE_ALIGN(size + offset_in_page(phys_addr)); + ret = hyp_alloc_private_va_range(size, &addr); + if (ret) + return ret; + + ret = __create_hyp_mappings(addr, size, phys_addr, prot); + if (ret) + return ret; + + *haddr = addr + offset_in_page(phys_addr); + return ret; +} + +int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) +{ + unsigned long base; + size_t size; + int ret; + + mutex_lock(&kvm_hyp_pgd_mutex); + /* + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies + * an alignment of our allocation on the order of the size. + */ + size = NVHE_STACK_SIZE * 2; + base = ALIGN_DOWN(io_map_base - size, size); + + ret = __hyp_alloc_private_va_range(base); + + mutex_unlock(&kvm_hyp_pgd_mutex); + + if (ret) { + kvm_err("Cannot allocate hyp stack guard page\n"); + return ret; + } + + /* + * Since the stack grows downwards, map the stack to the page + * at the higher address and leave the lower guard page + * unbacked. + * + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 + * and addresses corresponding to the guard page have the + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. + */ + ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, + phys_addr, PAGE_HYP); + if (ret) + kvm_err("Cannot map hyp stack\n"); + + *haddr = base + size; + + return ret; +} + +/** + * create_hyp_io_mappings - Map IO into both kernel and HYP + * @phys_addr: The physical start address which gets mapped + * @size: Size of the region being mapped + * @kaddr: Kernel VA for this mapping + * @haddr: HYP VA for this mapping + */ +int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, + void __iomem **kaddr, + void __iomem **haddr) +{ + unsigned long addr; + int ret; + + if (is_protected_kvm_enabled()) + return -EPERM; + + *kaddr = ioremap(phys_addr, size); + if (!*kaddr) + return -ENOMEM; + + if (is_kernel_in_hyp_mode()) { + *haddr = *kaddr; + return 0; + } + + ret = __create_hyp_private_mapping(phys_addr, size, + &addr, PAGE_HYP_DEVICE); + if (ret) { + iounmap(*kaddr); + *kaddr = NULL; + *haddr = NULL; + return ret; + } + + *haddr = (void __iomem *)addr; + return 0; +} + +/** + * create_hyp_exec_mappings - Map an executable range into HYP + * @phys_addr: The physical start address which gets mapped + * @size: Size of the region being mapped + * @haddr: HYP VA for this mapping + */ +int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, + void **haddr) +{ + unsigned long addr; + int ret; + + BUG_ON(is_kernel_in_hyp_mode()); + + ret = __create_hyp_private_mapping(phys_addr, size, + &addr, PAGE_HYP_EXEC); + if (ret) { + *haddr = NULL; + return ret; + } + + *haddr = (void *)addr; + return 0; +} + +static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { + /* We shouldn't need any other callback to walk the PT */ + .phys_to_virt = kvm_host_va, +}; + +static int get_user_mapping_size(struct kvm *kvm, u64 addr) +{ + struct kvm_pgtable pgt = { + .pgd = 
(kvm_pteref_t)kvm->mm->pgd, + .ia_bits = vabits_actual, + .start_level = (KVM_PGTABLE_LAST_LEVEL - + ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), + .mm_ops = &kvm_user_mm_ops, + }; + unsigned long flags; + kvm_pte_t pte = 0; /* Keep GCC quiet... */ + s8 level = S8_MAX; + int ret; + + /* + * Disable IRQs so that we hazard against a concurrent + * teardown of the userspace page tables (which relies on + * IPI-ing threads). + */ + local_irq_save(flags); + ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); + local_irq_restore(flags); + + if (ret) + return ret; + + /* + * Not seeing an error, but not updating level? Something went + * deeply wrong... + */ + if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) + return -EFAULT; + if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) + return -EFAULT; + + /* Oops, the userspace PTs are gone... Replay the fault */ + if (!kvm_pte_valid(pte)) + return -EAGAIN; + + return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); +} + +static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { + .zalloc_page = stage2_memcache_zalloc_page, + .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, + .free_pages_exact = kvm_s2_free_pages_exact, + .free_unlinked_table = stage2_free_unlinked_table, + .get_page = kvm_host_get_page, + .put_page = kvm_s2_put_page, + .page_count = kvm_host_page_count, + .phys_to_virt = kvm_host_va, + .virt_to_phys = kvm_host_pa, + .dcache_clean_inval_poc = clean_dcache_guest_page, + .icache_inval_pou = invalidate_icache_guest_page, +}; + +static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) +{ + u32 kvm_ipa_limit = get_kvm_ipa_limit(); + u64 mmfr0, mmfr1; + u32 phys_shift; + + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + return -EINVAL; + + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (is_protected_kvm_enabled()) { + phys_shift = kvm_ipa_limit; + } else if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < ARM64_MIN_PARANGE_BITS) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } + } + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + + return 0; +} + +/* + * Assume that @pgt is valid and unlinked from the KVM MMU to free the + * page-table without taking the kvm_mmu_lock and without performing any + * TLB invalidations. + * + * Also, the range of addresses can be large enough to cause need_resched + * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke + * cond_resched() periodically to prevent hogging the CPU for a long time + * and schedule something else, if required. 
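The IPA-size negotiation in kvm_init_ipa_range() above reduces to a few cases: protected VMs always get the host limit, an explicit KVM_VM_TYPE_ARM_IPA_SIZE value must lie between the minimum PARange and the host limit, and the legacy value 0 means 40 bits, rejected if the host cannot provide that. A stand-alone model of just that decision (the 32-bit minimum is an assumption mirroring ARM64_MIN_PARANGE_BITS):

#include <stdio.h>
#include <stdbool.h>

#define KVM_PHYS_SHIFT		40	/* legacy default IPA size */
#define MIN_PARANGE_BITS	32	/* assumed minimum PARange */

/* Return the IPA size to use, or -1 for an invalid request. */
static int select_phys_shift(unsigned int requested, unsigned int host_limit,
			     bool protected_vm)
{
	if (protected_vm)
		return host_limit;		/* pKVM: always the host limit */

	if (requested) {
		if (requested > host_limit || requested < MIN_PARANGE_BITS)
			return -1;
		return requested;
	}

	/* VMMs that pass 0 get 40 bits, provided the host supports it. */
	return (KVM_PHYS_SHIFT <= host_limit) ? KVM_PHYS_SHIFT : -1;
}

int main(void)
{
	printf("%d\n", select_phys_shift(0, 44, false));	/* 40 */
	printf("%d\n", select_phys_shift(48, 44, false));	/* -1: beyond the host */
	printf("%d\n", select_phys_shift(36, 44, true));	/* 44: pKVM ignores the request */
	return 0;
}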
+ */ +static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, + phys_addr_t end) +{ + u64 next; + + do { + next = stage2_range_addr_end(addr, end); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, + next - addr); + if (next != end) + cond_resched(); + } while (addr = next, addr != end); +} + +static void kvm_stage2_destroy(struct kvm_pgtable *pgt) +{ + unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); + + stage2_destroy_range(pgt, 0, BIT(ia_bits)); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); +} + +/** + * kvm_init_stage2_mmu - Initialise a S2 MMU structure + * @kvm: The pointer to the KVM structure + * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine + * + * Allocates only the stage-2 HW PGD level table(s). + * Note we don't need locking here as this is only called in two cases: + * + * - when the VM is created, which can't race against anything + * + * - when secondary kvm_s2_mmu structures are initialised for NV + * guests, and the caller must hold kvm->lock as this is called on a + * per-vcpu basis. + */ +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +{ + int cpu, err; + struct kvm_pgtable *pgt; + + /* + * If we already have our page tables in place, and that the + * MMU context is the canonical one, we have a bug somewhere, + * as this is only supposed to ever happen once per VM. + * + * Otherwise, we're building nested page tables, and that's + * probably because userspace called KVM_ARM_VCPU_INIT more + * than once on the same vcpu. Since that's actually legal, + * don't kick a fuss and leave gracefully. + */ + if (mmu->pgt != NULL) { + if (kvm_is_nested_s2_mmu(kvm, mmu)) + return 0; + + kvm_err("kvm_arch already initialized?\n"); + return -EINVAL; + } + + err = kvm_init_ipa_range(mmu, type); + if (err) + return err; + + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); + if (!pgt) + return -ENOMEM; + + mmu->arch = &kvm->arch; + err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); + if (err) + goto out_free_pgtable; + + mmu->pgt = pgt; + if (is_protected_kvm_enabled()) + return 0; + + mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); + if (!mmu->last_vcpu_ran) { + err = -ENOMEM; + goto out_destroy_pgtable; + } + + for_each_possible_cpu(cpu) + *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; + + /* The eager page splitting is disabled by default */ + mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + mmu->split_page_cache.gfp_zero = __GFP_ZERO; + + mmu->pgd_phys = __pa(pgt->pgd); + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + + return 0; + +out_destroy_pgtable: + kvm_stage2_destroy(pgt); +out_free_pgtable: + kfree(pgt); + return err; +} + +void kvm_uninit_stage2_mmu(struct kvm *kvm) +{ + kvm_free_stage2_pgd(&kvm->arch.mmu); + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); +} + +static void stage2_unmap_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + hva_t hva = memslot->userspace_addr; + phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t size = PAGE_SIZE * memslot->npages; + hva_t reg_end = hva + size; + + /* + * A memory region could potentially cover multiple VMAs, and any holes + * between them, so iterate over all of them to find out if we should + * unmap any of them. 
+ * + * +--------------------------------------------+ + * +---------------+----------------+ +----------------+ + * | : VMA 1 | VMA 2 | | VMA 3 : | + * +---------------+----------------+ +----------------+ + * | memory region | + * +--------------------------------------------+ + */ + do { + struct vm_area_struct *vma; + hva_t vm_start, vm_end; + + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) + break; + + /* + * Take the intersection of this VMA with the memory region + */ + vm_start = max(hva, vma->vm_start); + vm_end = min(reg_end, vma->vm_end); + + if (!(vma->vm_flags & VM_PFNMAP)) { + gpa_t gpa = addr + (vm_start - memslot->userspace_addr); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); + } + hva = vm_end; + } while (hva < reg_end); +} + +/** + * stage2_unmap_vm - Unmap Stage-2 RAM mappings + * @kvm: The struct kvm pointer + * + * Go through the memregions and unmap any regular RAM + * backing memory already mapped to the VM. + */ +void stage2_unmap_vm(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int idx, bkt; + + idx = srcu_read_lock(&kvm->srcu); + mmap_read_lock(current->mm); + write_lock(&kvm->mmu_lock); + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, bkt, slots) + stage2_unmap_memslot(kvm, memslot); + + kvm_nested_s2_unmap(kvm, true); + + write_unlock(&kvm->mmu_lock); + mmap_read_unlock(current->mm); + srcu_read_unlock(&kvm->srcu, idx); +} + +void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); + struct kvm_pgtable *pgt = NULL; + + write_lock(&kvm->mmu_lock); + pgt = mmu->pgt; + if (pgt) { + mmu->pgd_phys = 0; + mmu->pgt = NULL; + free_percpu(mmu->last_vcpu_ran); + } + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + + write_unlock(&kvm->mmu_lock); + + if (pgt) { + kvm_stage2_destroy(pgt); + kfree(pgt); + } +} + +static void hyp_mc_free_fn(void *addr, void *mc) +{ + struct kvm_hyp_memcache *memcache = mc; + + if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) + kvm_account_pgtable_pages(addr, -1); + + free_page((unsigned long)addr); +} + +static void *hyp_mc_alloc_fn(void *mc) +{ + struct kvm_hyp_memcache *memcache = mc; + void *addr; + + addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) + kvm_account_pgtable_pages(addr, 1); + + return addr; +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc) +{ + if (!is_protected_kvm_enabled()) + return; + + kfree(mc->mapping); + __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); +} + +int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) +{ + if (!is_protected_kvm_enabled()) + return 0; + + if (!mc->mapping) { + mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); + if (!mc->mapping) + return -ENOMEM; + } + + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, + kvm_host_pa, mc); +} + +/** + * kvm_phys_addr_ioremap - map a device range to guest IPA + * + * @kvm: The KVM pointer + * @guest_ipa: The IPA at which to insert the mapping + * @pa: The physical address of the device + * @size: The size of the mapping + * @writable: Whether or not to create a writable mapping + */ +int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, + phys_addr_t pa, unsigned long size, bool writable) +{ + phys_addr_t addr; + int ret = 0; + struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct kvm_pgtable 
*pgt = mmu->pgt; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | + KVM_PGTABLE_PROT_R | + (writable ? KVM_PGTABLE_PROT_W : 0); + + if (is_protected_kvm_enabled()) + return -EPERM; + + size += offset_in_page(guest_ipa); + guest_ipa &= PAGE_MASK; + + for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { + ret = kvm_mmu_topup_memory_cache(&cache, + kvm_mmu_cache_min_pages(mmu)); + if (ret) + break; + + write_lock(&kvm->mmu_lock); + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, + pa, prot, &cache, 0); + write_unlock(&kvm->mmu_lock); + if (ret) + break; + + pa += PAGE_SIZE; + } + + kvm_mmu_free_memory_cache(&cache); + return ret; +} + +/** + * kvm_stage2_wp_range() - write protect stage2 memory region range + * @mmu: The KVM stage-2 MMU pointer + * @addr: Start address of range + * @end: End address of range + */ +void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) +{ + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); +} + +/** + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to write protect + * + * Called to start logging dirty pages after memory region + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns + * all present PUD, PMD and PTEs are write protected in the memory region. + * Afterwards read of dirty page log can be called. + * + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. + */ +static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); + phys_addr_t start, end; + + if (WARN_ON_ONCE(!memslot)) + return; + + start = memslot->base_gfn << PAGE_SHIFT; + end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + write_lock(&kvm->mmu_lock); + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_nested_s2_wp(kvm); + write_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs_memslot(kvm, memslot); +} + +/** + * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE + * pages for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to split + * + * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. + */ +static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + phys_addr_t start, end; + + lockdep_assert_held(&kvm->slots_lock); + + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, slot); + + start = memslot->base_gfn << PAGE_SHIFT; + end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + write_lock(&kvm->mmu_lock); + kvm_mmu_split_huge_pages(kvm, start, end); + write_unlock(&kvm->mmu_lock); +} + +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of pages at offset 'gfn_offset' in this memory + * slot to enable dirty logging on + * + * Writes protect selected pages to enable dirty logging, and then + * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 
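The function defined just below first turns the caller's (gfn_offset, mask) pair into an IPA range: only the span between the lowest and the highest set bit of the mask needs write-protecting. A small worked example, assuming 4KiB pages and using compiler builtins in place of __ffs()/__fls():

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t base_gfn = 0x100;		/* slot base_gfn + gfn_offset */
	unsigned long long mask = 0xf0;		/* dirty pages 4..7 of this 64-page chunk */

	unsigned int first = __builtin_ctzll(mask);		/* __ffs(mask) == 4 */
	unsigned int last = 63 - __builtin_clzll(mask);		/* __fls(mask) == 7 */

	uint64_t start = (base_gfn + first) << PAGE_SHIFT;	/* 0x104000 */
	uint64_t end = (base_gfn + last + 1) << PAGE_SHIFT;	/* 0x108000 */

	printf("write-protect IPA range [%#llx, %#llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}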
+ */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + lockdep_assert_held_write(&kvm->mmu_lock); + + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); + + /* + * Eager-splitting is done when manual-protect is set. We + * also check for initially-all-set because we can avoid + * eager-splitting if initially-all-set is false. + * Initially-all-set equal false implies that huge-pages were + * already split when enabling dirty logging: no need to do it + * again. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + kvm_mmu_split_huge_pages(kvm, start, end); + + kvm_nested_s2_wp(kvm); +} + +static void kvm_send_hwpoison_signal(unsigned long address, short lsb) +{ + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); +} + +static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, + unsigned long hva, + unsigned long map_size) +{ + gpa_t gpa_start; + hva_t uaddr_start, uaddr_end; + size_t size; + + /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ + if (map_size == PAGE_SIZE) + return true; + + /* pKVM only supports PMD_SIZE huge-mappings */ + if (is_protected_kvm_enabled() && map_size != PMD_SIZE) + return false; + + size = memslot->npages * PAGE_SIZE; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 + * PMD/PUD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SHIFT: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 block | Stage-2 block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 blocks, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva & ~(map_size - 1)) >= uaddr_start && + (hva & ~(map_size - 1)) + map_size <= uaddr_end; +} + +/* + * Check if the given hva is backed by a transparent huge page (THP) and + * whether it can be mapped using block mapping in stage2. If so, adjust + * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently + * supported. This will need to be updated to support other THP sizes. + * + * Returns the size of the mapping. 
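The two tests in fault_supports_stage2_huge_mapping() above are easiest to see with concrete numbers: the IPA and the userspace address must be congruent modulo the block size (otherwise a block PTE would map the wrong pages), and the block-aligned window around hva must sit entirely inside the memslot. A stand-alone sketch, assuming 2MiB blocks and made-up addresses:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define BLOCK (2ULL << 20)	/* PMD_SIZE with 4KiB pages */

static bool huge_mapping_ok(uint64_t gpa_start, uint64_t uaddr_start,
			    uint64_t uaddr_end, uint64_t hva)
{
	/* Same offset within a block on both sides of the translation? */
	if ((gpa_start & (BLOCK - 1)) != (uaddr_start & (BLOCK - 1)))
		return false;

	/* Does the whole block around hva fit inside the memslot? */
	return (hva & ~(BLOCK - 1)) >= uaddr_start &&
	       (hva & ~(BLOCK - 1)) + BLOCK <= uaddr_end;
}

int main(void)
{
	/* IPA and HVA both block-aligned: a 2MiB mapping is fine. */
	printf("%d\n", huge_mapping_ok(0x80000000ULL, 0x7f5600000000ULL,
				       0x7f5600000000ULL + (64ULL << 20),
				       0x7f5600300000ULL));	/* 1 */

	/* Same layout, but the HVA is offset by 0x1000 within the block: refused. */
	printf("%d\n", huge_mapping_ok(0x80000000ULL, 0x7f5600001000ULL,
				       0x7f5600001000ULL + (64ULL << 20),
				       0x7f5600300000ULL));	/* 0 */
	return 0;
}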
+ */ +static long +transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long hva, kvm_pfn_t *pfnp, + phys_addr_t *ipap) +{ + kvm_pfn_t pfn = *pfnp; + + /* + * Make sure the adjustment is done only for THP pages. Also make + * sure that the HVA and IPA are sufficiently aligned and that the + * block map is contained within the memslot. + */ + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + int sz = get_user_mapping_size(kvm, hva); + + if (sz < 0) + return sz; + + if (sz < PMD_SIZE) + return PAGE_SIZE; + + *ipap &= PMD_MASK; + pfn &= ~(PTRS_PER_PMD - 1); + *pfnp = pfn; + + return PMD_SIZE; + } + + /* Use page mapping if we cannot use block mapping. */ + return PAGE_SIZE; +} + +static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) +{ + unsigned long pa; + + if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) + return huge_page_shift(hstate_vma(vma)); + + if (!(vma->vm_flags & VM_PFNMAP)) + return PAGE_SHIFT; + + VM_BUG_ON(is_vm_hugetlb_page(vma)); + + pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); + +#ifndef __PAGETABLE_PMD_FOLDED + if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && + ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && + ALIGN(hva, PUD_SIZE) <= vma->vm_end) + return PUD_SHIFT; +#endif + + if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && + ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && + ALIGN(hva, PMD_SIZE) <= vma->vm_end) + return PMD_SHIFT; + + return PAGE_SHIFT; +} + +/* + * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be + * able to see the page's tags and therefore they must be initialised first. If + * PG_mte_tagged is set, tags have already been initialised. + * + * Must be called with kvm->mmu_lock held to ensure the memory remains mapped + * while the tags are zeroed. + */ +static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, + unsigned long size) +{ + unsigned long i, nr_pages = size >> PAGE_SHIFT; + struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); + + if (!kvm_has_mte(kvm)) + return; + + if (folio_test_hugetlb(folio)) { + /* Hugetlb has MTE flags set on head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr_pages; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + return; + } + + for (i = 0; i < nr_pages; i++, page++) { + if (try_page_mte_tagging(page)) { + mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } + } +} + +static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_MTE_ALLOWED; +} + +static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) +{ + switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) { + case MT_NORMAL_NC: + case MT_DEVICE_nGnRnE: + case MT_DEVICE_nGnRE: + return false; + default: + return true; + } +} + +static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache, + void **memcache) +{ + int min_pages; + + if (!is_protected_kvm_enabled()) + *memcache = &vcpu->arch.mmu_page_cache; + else + *memcache = &vcpu->arch.pkvm_memcache; + + if (!topup_memcache) + return 0; + + min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); + + if (!is_protected_kvm_enabled()) + return kvm_mmu_topup_memory_cache(*memcache, min_pages); + + return topup_hyp_memcache(*memcache, min_pages); +} + +/* + * Potentially reduce shadow S2 permissions to match the guest's own S2. 
For + * exec faults, we'd only reach this point if the guest actually allowed it (see + * kvm_s2_handle_perm_fault). + * + * Also encode the level of the original translation in the SW bits of the leaf + * entry as a proxy for the span of that translation. This will be retrieved on + * TLB invalidation from the guest and used to limit the invalidation scope if a + * TTL hint or a range isn't provided. + */ +static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, + enum kvm_pgtable_prot *prot, + bool *writable) +{ + *writable &= kvm_s2_trans_writable(nested); + if (!kvm_s2_trans_readable(nested)) + *prot &= ~KVM_PGTABLE_PROT_R; + + *prot |= kvm_encode_nested_level(nested); +} + +static void adjust_nested_exec_perms(struct kvm *kvm, + struct kvm_s2_trans *nested, + enum kvm_pgtable_prot *prot) +{ + if (!kvm_s2_trans_exec_el0(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_UX; + if (!kvm_s2_trans_exec_el1(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_PX; +} + +#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED) + +static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, + struct kvm_memory_slot *memslot, bool is_perm) +{ + bool write_fault, exec_fault, writable; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; + unsigned long mmu_seq; + struct page *page; + struct kvm *kvm = vcpu->kvm; + void *memcache; + kvm_pfn_t pfn; + gfn_t gfn; + int ret; + + ret = prepare_mmu_memcache(vcpu, true, &memcache); + if (ret) + return ret; + + if (nested) + gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT; + else + gfn = fault_ipa >> PAGE_SHIFT; + + write_fault = kvm_is_write_fault(vcpu); + exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); + + VM_WARN_ON_ONCE(write_fault && exec_fault); + + mmu_seq = kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL); + if (ret) { + kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE, + write_fault, exec_fault, false); + return ret; + } + + writable = !(memslot->flags & KVM_MEM_READONLY); + + if (nested) + adjust_nested_fault_perms(nested, &prot, &writable); + + if (writable) + prot |= KVM_PGTABLE_PROT_W; + + if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) + prot |= KVM_PGTABLE_PROT_X; + + if (nested) + adjust_nested_exec_perms(kvm, nested, &prot); + + kvm_fault_lock(kvm); + if (mmu_invalidate_retry(kvm, mmu_seq)) { + ret = -EAGAIN; + goto out_unlock; + } + + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE, + __pfn_to_phys(pfn), prot, + memcache, flags); + +out_unlock: + kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_fault_unlock(kvm); + + if (writable && !ret) + mark_page_dirty_in_slot(kvm, memslot, gfn); + + return ret != -EAGAIN ? 
ret : 0; +} + +static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, + struct kvm_memory_slot *memslot, unsigned long hva, + bool fault_is_perm) +{ + int ret = 0; + bool topup_memcache; + bool write_fault, writable; + bool exec_fault, mte_allowed, is_vma_cacheable; + bool s2_force_noncacheable = false, vfio_allow_any_uc = false; + unsigned long mmu_seq; + phys_addr_t ipa = fault_ipa; + struct kvm *kvm = vcpu->kvm; + struct vm_area_struct *vma; + short vma_shift; + void *memcache; + gfn_t gfn; + kvm_pfn_t pfn; + bool logging_active = memslot_is_logging(memslot); + bool force_pte = logging_active; + long vma_pagesize, fault_granule; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; + struct page *page; + vm_flags_t vm_flags; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; + + if (fault_is_perm) + fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); + write_fault = kvm_is_write_fault(vcpu); + exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); + VM_WARN_ON_ONCE(write_fault && exec_fault); + + /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. + */ + topup_memcache = !fault_is_perm || (logging_active && write_fault); + ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache); + if (ret) + return ret; + + /* + * Let's check if we will get back a huge page backed by hugetlbfs, or + * get block mapping for device MMIO region. + */ + mmap_read_lock(current->mm); + vma = vma_lookup(current->mm, hva); + if (unlikely(!vma)) { + kvm_err("Failed to find VMA for hva 0x%lx\n", hva); + mmap_read_unlock(current->mm); + return -EFAULT; + } + + if (force_pte) + vma_shift = PAGE_SHIFT; + else + vma_shift = get_vma_page_shift(vma, hva); + + switch (vma_shift) { +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SHIFT: + if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) + break; + fallthrough; +#endif + case CONT_PMD_SHIFT: + vma_shift = PMD_SHIFT; + fallthrough; + case PMD_SHIFT: + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) + break; + fallthrough; + case CONT_PTE_SHIFT: + vma_shift = PAGE_SHIFT; + force_pte = true; + fallthrough; + case PAGE_SHIFT: + break; + default: + WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); + } + + vma_pagesize = 1UL << vma_shift; + + if (nested) { + unsigned long max_map_size; + + max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; + + ipa = kvm_s2_trans_output(nested); + + /* + * If we're about to create a shadow stage 2 entry, then we + * can only create a block mapping if the guest stage 2 page + * table uses at least as big a mapping. + */ + max_map_size = min(kvm_s2_trans_size(nested), max_map_size); + + /* + * Be careful that if the mapping size falls between + * two host sizes, take the smallest of the two. + */ + if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) + max_map_size = PMD_SIZE; + else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) + max_map_size = PAGE_SIZE; + + force_pte = (max_map_size == PAGE_SIZE); + vma_pagesize = min_t(long, vma_pagesize, max_map_size); + } + + /* + * Both the canonical IPA and fault IPA must be hugepage-aligned to + * ensure we find the right PFN and lay down the mapping in the right + * place. 
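For the nested case handled just above, the shadow mapping can never be larger than what the guest's own stage-2 granted, and an in-between size is rounded down to the next size the host can actually install. A stand-alone model of that clamping (4KiB pages assumed; the force_pte and PUD-cap handling is omitted):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	(4ULL << 10)
#define PMD_SIZE	(2ULL << 20)
#define PUD_SIZE	(1ULL << 30)

/* Clamp the host mapping size to what the guest stage-2 granted. */
static uint64_t clamp_map_size(uint64_t vma_pagesize, uint64_t guest_s2_size)
{
	uint64_t max = guest_s2_size;

	/* Round an in-between size down to a size the host can map. */
	if (max >= PMD_SIZE && max < PUD_SIZE)
		max = PMD_SIZE;
	else if (max >= PAGE_SIZE && max < PMD_SIZE)
		max = PAGE_SIZE;

	return vma_pagesize < max ? vma_pagesize : max;
}

int main(void)
{
	/* The host VMA could give 1GiB, but the guest only mapped 16MiB: use 2MiB. */
	printf("%#llx\n", (unsigned long long)clamp_map_size(PUD_SIZE, 16ULL << 20));
	/* The guest mapped 2MiB and the VMA allows it: keep 2MiB. */
	printf("%#llx\n", (unsigned long long)clamp_map_size(PMD_SIZE, PMD_SIZE));
	return 0;
}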
+ */ + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { + fault_ipa &= ~(vma_pagesize - 1); + ipa &= ~(vma_pagesize - 1); + } + + gfn = ipa >> PAGE_SHIFT; + mte_allowed = kvm_vma_mte_allowed(vma); + + vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; + + vm_flags = vma->vm_flags; + + is_vma_cacheable = kvm_vma_is_cacheable(vma); + + /* Don't use the VMA after the unlock -- it may have vanished */ + vma = NULL; + + /* + * Read mmu_invalidate_seq so that KVM can detect if the results of + * vma_lookup() or __kvm_faultin_pfn() become stale prior to + * acquiring kvm->mmu_lock. + * + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). + */ + mmu_seq = kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); + + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, + &writable, &page); + if (pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(hva, vma_shift); + return 0; + } + if (is_error_noslot_pfn(pfn)) + return -EFAULT; + + /* + * Check if this is non-struct page memory PFN, and cannot support + * CMOs. It could potentially be unsafe to access as cacheable. + */ + if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) { + if (is_vma_cacheable) { + /* + * Whilst the VMA owner expects cacheable mapping to this + * PFN, hardware also has to support the FWB and CACHE DIC + * features. + * + * ARM64 KVM relies on kernel VA mapping to the PFN to + * perform cache maintenance as the CMO instructions work on + * virtual addresses. VM_PFNMAP region are not necessarily + * mapped to a KVA and hence the presence of hardware features + * S2FWB and CACHE DIC are mandatory to avoid the need for + * cache maintenance. + */ + if (!kvm_supports_cacheable_pfnmap()) + ret = -EFAULT; + } else { + /* + * If the page was identified as device early by looking at + * the VMA flags, vma_pagesize is already representing the + * largest quantity we can map. If instead it was mapped + * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE + * and must not be upgraded. + * + * In both cases, we don't let transparent_hugepage_adjust() + * change things at the last minute. + */ + s2_force_noncacheable = true; + } + } else if (logging_active && !write_fault) { + /* + * Only actually map the page as writable if this was a write + * fault. + */ + writable = false; + } + + if (exec_fault && s2_force_noncacheable) + ret = -ENOEXEC; + + if (ret) { + kvm_release_page_unused(page); + return ret; + } + + if (nested) + adjust_nested_fault_perms(nested, &prot, &writable); + + kvm_fault_lock(kvm); + pgt = vcpu->arch.hw_mmu->pgt; + if (mmu_invalidate_retry(kvm, mmu_seq)) { + ret = -EAGAIN; + goto out_unlock; + } + + /* + * If we are not forced to use page mapping, check if we are + * backed by a THP and thus use block mapping if possible. 
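The mmu_invalidate_seq handling earlier in this function is the standard KVM pattern for resolving a pfn outside the MMU lock: sample the sequence count, do the slow hva-to-pfn translation unlocked, then retry after taking the lock if an MMU-notifier invalidation ran in between. A schematic, self-contained rendition of the pattern; the struct and names below are illustrative, not the kernel's:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for kvm->mmu_invalidate_seq / mmu_invalidate_in_progress. */
struct toy_mmu {
	unsigned long invalidate_seq;	/* bumped by each completed invalidation */
	unsigned long in_progress;	/* non-zero while one is running */
};

static bool invalidate_retry(struct toy_mmu *mmu, unsigned long snapshot)
{
	return mmu->in_progress || mmu->invalidate_seq != snapshot;
}

/* Shape of the fault path: snapshot, translate outside the lock, revalidate. */
static int fault_in(struct toy_mmu *mmu)
{
	unsigned long seq = mmu->invalidate_seq;	/* + smp_rmb() in the kernel */

	/* ... slow hva->pfn lookup would run here, with no MMU lock held ... */

	/* write_lock(&kvm->mmu_lock) in the real code */
	if (invalidate_retry(mmu, seq))
		return -EAGAIN;		/* pfn may be stale: replay the fault */

	/* ... safe to install the stage-2 mapping for that pfn here ... */
	return 0;
}

int main(void)
{
	struct toy_mmu mmu = { .invalidate_seq = 42 };

	printf("%d\n", fault_in(&mmu));	/* 0: nothing raced */
	/* Had an invalidation bumped the count after the snapshot, the caller
	 * would have seen -EAGAIN and simply replayed the fault. */
	return 0;
}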
+ */ + if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) { + if (fault_is_perm && fault_granule > PAGE_SIZE) + vma_pagesize = fault_granule; + else + vma_pagesize = transparent_hugepage_adjust(kvm, memslot, + hva, &pfn, + &fault_ipa); + + if (vma_pagesize < 0) { + ret = vma_pagesize; + goto out_unlock; + } + } + + if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) { + /* Check the VMM hasn't introduced a new disallowed VMA */ + if (mte_allowed) { + sanitise_mte_tags(kvm, pfn, vma_pagesize); + } else { + ret = -EFAULT; + goto out_unlock; + } + } + + if (writable) + prot |= KVM_PGTABLE_PROT_W; + + if (exec_fault) + prot |= KVM_PGTABLE_PROT_X; + + if (s2_force_noncacheable) { + if (vfio_allow_any_uc) + prot |= KVM_PGTABLE_PROT_NORMAL_NC; + else + prot |= KVM_PGTABLE_PROT_DEVICE; + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { + prot |= KVM_PGTABLE_PROT_X; + } + + if (nested) + adjust_nested_exec_perms(kvm, nested, &prot); + + /* + * Under the premise of getting a FSC_PERM fault, we just need to relax + * permissions only if vma_pagesize equals fault_granule. Otherwise, + * kvm_pgtable_stage2_map() should be called to change block size. + */ + if (fault_is_perm && vma_pagesize == fault_granule) { + /* + * Drop the SW bits in favour of those stored in the + * PTE, which will be preserved. + */ + prot &= ~KVM_NV_GUEST_MAP_SZ; + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); + } else { + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, + __pfn_to_phys(pfn), prot, + memcache, flags); + } + +out_unlock: + kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_fault_unlock(kvm); + + /* Mark the page dirty only if the fault is handled successfully */ + if (writable && !ret) + mark_page_dirty_in_slot(kvm, memslot, gfn); + + return ret != -EAGAIN ? ret : 0; +} + +/* Resolve the access fault by making the page young again. */ +static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) +{ + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; + struct kvm_s2_mmu *mmu; + + trace_kvm_access_fault(fault_ipa); + + read_lock(&vcpu->kvm->mmu_lock); + mmu = vcpu->arch.hw_mmu; + KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); + read_unlock(&vcpu->kvm->mmu_lock); +} + +/* + * Returns true if the SEA should be handled locally within KVM if the abort + * is caused by a kernel memory allocation (e.g. stage-2 table memory). + */ +static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr) +{ + /* + * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort + * taken from a guest EL to EL2 is due to a host-imposed access (e.g. + * stage-2 PTW). + */ + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return true; + + /* KVM owns the VNCR when the vCPU isn't in a nested context. */ + if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR)) + return true; + + /* + * Determining if an external abort during a table walk happened at + * stage-2 is only possible with S1PTW is set. Otherwise, since KVM + * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the + * PA of the stage-1 descriptor) can reach here and are reported + * with a TTW ESR value. 
+ */ + return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW)); +} + +int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_run *run = vcpu->run; + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 esr_mask = ESR_ELx_EC_MASK | + ESR_ELx_IL | + ESR_ELx_FnV | + ESR_ELx_EA | + ESR_ELx_CM | + ESR_ELx_WNR | + ESR_ELx_FSC; + u64 ipa; + + /* + * Give APEI the opportunity to claim the abort before handling it + * within KVM. apei_claim_sea() expects to be called with IRQs enabled. + */ + lockdep_assert_irqs_enabled(); + if (apei_claim_sea(NULL) == 0) + return 1; + + if (host_owns_sea(vcpu, esr) || + !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags)) + return kvm_inject_serror(vcpu); + + /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */ + if (kvm_has_ras(kvm)) + esr_mask |= ESR_ELx_SET_MASK; + + /* + * Exit to userspace, and provide faulting guest virtual and physical + * addresses in case userspace wants to emulate SEA to guest by + * writing to FAR_ELx and HPFAR_ELx registers. + */ + memset(&run->arm_sea, 0, sizeof(run->arm_sea)); + run->exit_reason = KVM_EXIT_ARM_SEA; + run->arm_sea.esr = esr & esr_mask; + + if (!(esr & ESR_ELx_FnV)) + run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu); + + ipa = kvm_vcpu_get_fault_ipa(vcpu); + if (ipa != INVALID_GPA) { + run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID; + run->arm_sea.gpa = ipa; + } + + return 0; +} + +/** + * kvm_handle_guest_abort - handles all 2nd stage aborts + * @vcpu: the VCPU pointer + * + * Any abort that gets to the host is almost guaranteed to be caused by a + * missing second stage translation table entry, which can mean that either the + * guest simply needs more memory and we must allocate an appropriate page or it + * can mean that the guest tried to access I/O memory, which is emulated by user + * space. The distinction is based on the IPA causing the fault and whether this + * memory region has been registered as standard RAM by user space. + */ +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) +{ + struct kvm_s2_trans nested_trans, *nested = NULL; + unsigned long esr; + phys_addr_t fault_ipa; /* The address we faulted on */ + phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ + struct kvm_memory_slot *memslot; + unsigned long hva; + bool is_iabt, write_fault, writable; + gfn_t gfn; + int ret, idx; + + if (kvm_vcpu_abt_issea(vcpu)) + return kvm_handle_guest_sea(vcpu); + + esr = kvm_vcpu_get_esr(vcpu); + + /* + * The fault IPA should be reliable at this point as we're not dealing + * with an SEA. + */ + ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) + return -EFAULT; + + is_iabt = kvm_vcpu_trap_is_iabt(vcpu); + + if (esr_fsc_is_translation_fault(esr)) { + /* Beyond sanitised PARange (which is the IPA limit) */ + if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { + kvm_inject_size_fault(vcpu); + return 1; + } + + /* Falls between the IPA range and the PARange? */ + if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { + fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + + return kvm_inject_sea(vcpu, is_iabt, fault_ipa); + } + } + + trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), + kvm_vcpu_get_hfar(vcpu), fault_ipa); + + /* Check the stage-2 fault is trans. 
fault or write fault */ + if (!esr_fsc_is_translation_fault(esr) && + !esr_fsc_is_permission_fault(esr) && + !esr_fsc_is_access_flag_fault(esr)) { + kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", + kvm_vcpu_trap_get_class(vcpu), + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), + (unsigned long)kvm_vcpu_get_esr(vcpu)); + return -EFAULT; + } + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + /* + * We may have faulted on a shadow stage 2 page table if we are + * running a nested guest. In this case, we have to resolve the L2 + * IPA to the L1 IPA first, before knowing what kind of memory should + * back the L1 IPA. + * + * If the shadow stage 2 page table walk faults, then we simply inject + * this to the guest and carry on. + * + * If there are no shadow S2 PTs because S2 is disabled, there is + * nothing to walk and we treat it as a 1:1 before going through the + * canonical translation. + */ + if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && + vcpu->arch.hw_mmu->nested_stage2_enabled) { + u32 esr; + + ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); + if (ret == -EAGAIN) { + ret = 1; + goto out_unlock; + } + + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ipa = kvm_s2_trans_output(&nested_trans); + nested = &nested_trans; + } + + gfn = ipa >> PAGE_SHIFT; + memslot = gfn_to_memslot(vcpu->kvm, gfn); + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); + write_fault = kvm_is_write_fault(vcpu); + if (kvm_is_error_hva(hva) || (write_fault && !writable)) { + /* + * The guest has put either its instructions or its page-tables + * somewhere it shouldn't have. Userspace won't be able to do + * anything about this (there's no syndrome for a start), so + * re-inject the abort back into the guest. + */ + if (is_iabt) { + ret = -ENOEXEC; + goto out; + } + + if (kvm_vcpu_abt_iss1tw(vcpu)) { + ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + goto out_unlock; + } + + /* + * Check for a cache maintenance operation. Since we + * ended-up here, we know it is outside of any memory + * slot. But we can't find out if that is for a device, + * or if the guest is just being stupid. The only thing + * we know for sure is that this range cannot be cached. + * + * So let's assume that the guest is just being + * cautious, and skip the instruction. + */ + if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { + kvm_incr_pc(vcpu); + ret = 1; + goto out_unlock; + } + + /* + * The IPA is reported as [MAX:12], so we need to + * complement it with the bottom 12 bits from the + * faulting VA. This is always 12 bits, irrespective + * of the page size. 
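HPFAR_EL2 only reports the faulting IPA at page granularity, so the statement just below pastes the low 12 bits of the faulting VA back in before the address is handed to the I/O emulation path. It is a one-line operation, but a worked value makes it concrete (addresses are invented):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ipa = 0x3f201000;		/* page-granular IPA from HPFAR_EL2 */
	uint64_t far = 0xffff000012345abc;	/* faulting VA (FAR_EL2) */

	/* ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0) in the code below */
	ipa |= far & 0xfffULL;

	printf("%#llx\n", (unsigned long long)ipa);	/* 0x3f201abc */
	return 0;
}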
+ */ + ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + ret = io_mem_abort(vcpu, ipa); + goto out_unlock; + } + + /* Userspace should not be able to register out-of-bounds IPAs */ + VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); + + if (esr_fsc_is_access_flag_fault(esr)) { + handle_access_fault(vcpu, fault_ipa); + ret = 1; + goto out_unlock; + } + + VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && + !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu)); + + if (kvm_slot_has_gmem(memslot)) + ret = gmem_abort(vcpu, fault_ipa, nested, memslot, + esr_fsc_is_permission_fault(esr)); + else + ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, + esr_fsc_is_permission_fault(esr)); + if (ret == 0) + ret = 1; +out: + if (ret == -ENOEXEC) + ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu)); +out_unlock: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} + +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + if (!kvm->arch.mmu.pgt) + return false; + + __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, + (range->end - range->start) << PAGE_SHIFT, + range->may_block); + + kvm_nested_s2_unmap(kvm, range->may_block); + return false; +} + +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + u64 size = (range->end - range->start) << PAGE_SHIFT; + + if (!kvm->arch.mmu.pgt) + return false; + + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT, + size, true); + /* + * TODO: Handle nested_mmu structures here using the reverse mapping in + * a later version of patch series. + */ +} + +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + u64 size = (range->end - range->start) << PAGE_SHIFT; + + if (!kvm->arch.mmu.pgt) + return false; + + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT, + size, false); +} + +phys_addr_t kvm_mmu_get_httbr(void) +{ + return __pa(hyp_pgtable->pgd); +} + +phys_addr_t kvm_get_idmap_vector(void) +{ + return hyp_idmap_vector; +} + +static int kvm_map_idmap_text(void) +{ + unsigned long size = hyp_idmap_end - hyp_idmap_start; + int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start, + PAGE_HYP_EXEC); + if (err) + kvm_err("Failed to idmap %lx-%lx\n", + hyp_idmap_start, hyp_idmap_end); + + return err; +} + +static void *kvm_hyp_zalloc_page(void *arg) +{ + return (void *)get_zeroed_page(GFP_KERNEL); +} + +static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { + .zalloc_page = kvm_hyp_zalloc_page, + .get_page = kvm_host_get_page, + .put_page = kvm_host_put_page, + .phys_to_virt = kvm_host_va, + .virt_to_phys = kvm_host_pa, +}; + +int __init kvm_mmu_init(u32 *hyp_va_bits) +{ + int err; + u32 idmap_bits; + u32 kernel_bits; + + hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); + hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); + hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end); + hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); + hyp_idmap_vector = __pa_symbol(__kvm_hyp_init); + + /* + * We rely on the linker script to ensure at build time that the HYP + * init code does not cross a page boundary. + */ + BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); + + /* + * The ID map is always configured for 48 bits of translation, which + * may be fewer than the number of VA bits used by the regular kernel + * stage 1, when VA_BITS=52. 
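The BUG_ON() in kvm_mmu_init() a few lines up relies on a compact idiom: after making the end address inclusive (end - 1), two addresses lie on the same page exactly when XOR-ing them leaves no bits above the page offset. A stand-alone illustration with 4KiB pages and made-up addresses:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1ULL << PAGE_SHIFT) - 1))

static bool same_page(uint64_t start, uint64_t last)
{
	/* No differing bits above bit 11 => same 4KiB page. */
	return ((start ^ last) & PAGE_MASK) == 0;
}

int main(void)
{
	/* 0x48100040 .. 0x48100fff: one page, the BUG_ON() stays quiet. */
	printf("%d\n", same_page(0x48100040, 0x48100fff));	/* 1 */
	/* 0x48100f00 .. 0x48101100 crosses a page boundary: would BUG(). */
	printf("%d\n", same_page(0x48100f00, 0x48101100));	/* 0 */
	return 0;
}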
+ * + * At EL2, there is only one TTBR register, and we can't switch between + * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom + * line: we need to use the extended range with *both* our translation + * tables. + * + * So use the maximum of the idmap VA bits and the regular kernel stage + * 1 VA bits to assure that the hypervisor can both ID map its code page + * and map any kernel memory. + */ + idmap_bits = IDMAP_VA_BITS; + kernel_bits = vabits_actual; + *hyp_va_bits = max(idmap_bits, kernel_bits); + + kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits); + kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); + kvm_debug("HYP VA range: %lx:%lx\n", + kern_hyp_va(PAGE_OFFSET), + kern_hyp_va((unsigned long)high_memory - 1)); + + if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && + hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && + hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { + /* + * The idmap page is intersecting with the VA space, + * it is not safe to continue further. + */ + kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); + err = -EINVAL; + goto out; + } + + hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL); + if (!hyp_pgtable) { + kvm_err("Hyp mode page-table not allocated\n"); + err = -ENOMEM; + goto out; + } + + err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops); + if (err) + goto out_free_pgtable; + + err = kvm_map_idmap_text(); + if (err) + goto out_destroy_pgtable; + + io_map_base = hyp_idmap_start; + __hyp_va_bits = *hyp_va_bits; + return 0; + +out_destroy_pgtable: + kvm_pgtable_hyp_destroy(hyp_pgtable); +out_free_pgtable: + kfree(hyp_pgtable); + hyp_pgtable = NULL; +out: + return err; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; + + /* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be tracked while the + * memory slot is write protected. + */ + if (log_dirty_pages) { + + if (change == KVM_MR_DELETE) + return; + + /* + * Huge and normal pages are write-protected and split + * on either of these two cases: + * + * 1. with initial-all-set: gradually with CLEAR ioctls, + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + return; + /* + * or + * 2. without initial-all-set: all in one shot when + * enabling dirty logging. + */ + kvm_mmu_wp_memory_region(kvm, new->id); + kvm_mmu_split_memory_region(kvm, new->id); + } else { + /* + * Free any leftovers from the eager page splitting cache. Do + * this when deleting, moving, disabling dirty logging, or + * creating the memslot (a nop). Doing it for deletes makes + * sure we don't leak memory, and there's no need to keep the + * cache around for any of the other cases. + */ + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); + } +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + hva_t hva, reg_end; + int ret = 0; + + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && + change != KVM_MR_FLAGS_ONLY) + return 0; + + /* + * Prevent userspace from creating a memory region outside of the IPA + * space addressable by the KVM guest IPA space. 
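+ *
+ * Worked example (hypothetical numbers): with a 40-bit IPA space and
+ * 4K pages, kvm_phys_size() >> PAGE_SHIFT is 0x10000000 frames, so a
+ * slot with base_gfn == 0x0ffff000 and npages == 0x2000 would end at
+ * 0x10001000 and is rejected with -EFAULT below.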
+ */ + if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) + return -EFAULT; + + /* + * Only support guest_memfd backed memslots with mappable memory, since + * there aren't any CoCo VMs that support only private memory on arm64. + */ + if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new)) + return -EINVAL; + + hva = new->userspace_addr; + reg_end = hva + (new->npages << PAGE_SHIFT); + + mmap_read_lock(current->mm); + /* + * A memory region could potentially cover multiple VMAs, and any holes + * between them, so iterate over all of them. + * + * +--------------------------------------------+ + * +---------------+----------------+ +----------------+ + * | : VMA 1 | VMA 2 | | VMA 3 : | + * +---------------+----------------+ +----------------+ + * | memory region | + * +--------------------------------------------+ + */ + do { + struct vm_area_struct *vma; + + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) + break; + + if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { + ret = -EINVAL; + break; + } + + if (vma->vm_flags & VM_PFNMAP) { + /* IO region dirty page logging not allowed */ + if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + ret = -EINVAL; + break; + } + + /* + * Cacheable PFNMAP is allowed only if the hardware + * supports it. + */ + if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) { + ret = -EINVAL; + break; + } + } + hva = min(reg_end, vma->vm_end); + } while (hva < reg_end); + + mmap_read_unlock(current->mm); + return ret; +} + +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ +} + +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) +{ +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + gpa_t gpa = slot->base_gfn << PAGE_SHIFT; + phys_addr_t size = slot->npages << PAGE_SHIFT; + + write_lock(&kvm->mmu_lock); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); + kvm_nested_s2_unmap(kvm, true); + write_unlock(&kvm->mmu_lock); +} + +/* + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). + * + * Main problems: + * - S/W ops are local to a CPU (not broadcast) + * - We have line migration behind our back (speculation) + * - System caches don't support S/W at all (damn!) + * + * In the face of the above, the best we can do is to try and convert + * S/W ops to VA ops. Because the guest is not allowed to infer the + * S/W to PA mapping, it can only use S/W to nuke the whole cache, + * which is a rather good thing for us. + * + * Also, it is only used when turning caches on/off ("The expected + * usage of the cache maintenance instructions that operate by set/way + * is associated with the cache maintenance instructions associated + * with the powerdown and powerup of caches, if this is required by + * the implementation."). + * + * We use the following policy: + * + * - If we trap a S/W operation, we enable VM trapping to detect + * caches being turned on/off, and do a full clean. + * + * - We flush the caches on both caches being turned on and off. + * + * - Once the caches are enabled, we stop trapping VM ops. + */ +void kvm_set_way_flush(struct kvm_vcpu *vcpu) +{ + unsigned long hcr = *vcpu_hcr(vcpu); + + /* + * If this is the first time we do a S/W operation + * (i.e. HCR_TVM not set) flush the whole memory, and set the + * VM trapping. + * + * Otherwise, rely on the VM trapping to wait for the MMU + + * Caches to be turned off. At that point, we'll be able to + * clean the caches again. 
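+ *
+ * A rough sketch of the resulting sequence (illustrative only):
+ *
+ *    guest DC CISW        -> trap: stage2_flush_vm(), set HCR_TVM
+ *    guest SCTLR_EL1.C=0  -> trap: stage2_flush_vm() (kvm_toggle_cache)
+ *    guest SCTLR_EL1.C=1  -> trap: stage2_flush_vm(), clear HCR_TVM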
+ */ + if (!(hcr & HCR_TVM)) { + trace_kvm_set_way_flush(*vcpu_pc(vcpu), + vcpu_has_cache_enabled(vcpu)); + stage2_flush_vm(vcpu->kvm); + *vcpu_hcr(vcpu) = hcr | HCR_TVM; + } +} + +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) +{ + bool now_enabled = vcpu_has_cache_enabled(vcpu); + + /* + * If switching the MMU+caches on, need to invalidate the caches. + * If switching it off, need to clean the caches. + * Clean + invalidate does the trick always. + */ + if (now_enabled != was_enabled) + stage2_flush_vm(vcpu->kvm); + + /* Caches are now on, stop trapping VM ops (until a S/W op) */ + if (now_enabled) + *vcpu_hcr(vcpu) &= ~HCR_TVM; + + trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); +} diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c new file mode 100644 index 000000000000..cdeeb8f09e72 --- /dev/null +++ b/arch/arm64/kvm/nested.c @@ -0,0 +1,1919 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Columbia University and Linaro Ltd. + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/bitfield.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/fixmap.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> +#include <asm/sysreg.h> + +#include "sys_regs.h" + +struct vncr_tlb { + /* The guest's VNCR_EL2 */ + u64 gva; + struct s1_walk_info wi; + struct s1_walk_result wr; + + u64 hpa; + + /* -1 when not mapped on a CPU */ + int cpu; + + /* + * true if the TLB is valid. Can only be changed with the + * mmu_lock held. + */ + bool valid; +}; + +/* + * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between + * memory usage and potential number of different sets of S2 PTs in + * the guests. Running out of S2 MMUs only affects performance (we + * will invalidate them more often). + */ +#define S2_MMU_PER_VCPU 2 + +void kvm_init_nested(struct kvm *kvm) +{ + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; + atomic_set(&kvm->arch.vncr_map_count, 0); +} + +static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +{ + /* + * We only initialise the IPA range on the canonical MMU, which + * defines the contract between KVM and userspace on where the + * "hardware" is in the IPA space. This affects the validity of MMIO + * exits forwarded to userspace, for example. + * + * For nested S2s, we use the PARange as exposed to the guest, as it + * is allowed to use it at will to expose whatever memory map it + * wants to its own guests as it would be on real HW. + */ + return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm)); +} + +int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_s2_mmu *tmp; + int num_mmus, ret = 0; + + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) && + !cpus_have_final_cap(ARM64_HAS_HCR_NV1)) + return -EINVAL; + + if (!vcpu->arch.ctxt.vncr_array) + vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT | + __GFP_ZERO); + + if (!vcpu->arch.ctxt.vncr_array) + return -ENOMEM; + + /* + * Let's treat memory allocation failures as benign: If we fail to + * allocate anything, return an error and keep the allocated array + * alive. Userspace may try to recover by initializing the vcpu + * again, and there is no reason to affect the whole VM for this. 
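+ *
+ * For scale (hypothetical): with 4 online vcpus and S2_MMU_PER_VCPU
+ * == 2, the kvrealloc() below sizes the array for 8 shadow S2 MMUs.
+ * Running out of them only costs performance, never correctness.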
+ */ + num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; + tmp = kvrealloc(kvm->arch.nested_mmus, + size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!tmp) + return -ENOMEM; + + swap(kvm->arch.nested_mmus, tmp); + + /* + * If we went through a realocation, adjust the MMU back-pointers in + * the previously initialised kvm_pgtable structures. + */ + if (kvm->arch.nested_mmus != tmp) + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) + kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; + + for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) + ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); + + if (ret) { + for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) + kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); + + free_page((unsigned long)vcpu->arch.ctxt.vncr_array); + vcpu->arch.ctxt.vncr_array = NULL; + + return ret; + } + + kvm->arch.nested_mmus_size = num_mmus; + + return 0; +} + +struct s2_walk_info { + u64 baddr; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int sl; + unsigned int t0sz; + bool be; + bool ha; +}; + +static u32 compute_fsc(int level, u32 fsc) +{ + return fsc | (level & 0x3); +} + +static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc) +{ + u32 esr; + + esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC; + esr |= compute_fsc(level, fsc); + return esr; +} + +static int get_ia_size(struct s2_walk_info *wi) +{ + return 64 - wi->t0sz; +} + +static int check_base_s2_limits(struct s2_walk_info *wi, + int level, int input_size, int stride) +{ + int start_size, ia_size; + + ia_size = get_ia_size(wi); + + /* Check translation limits */ + switch (BIT(wi->pgshift)) { + case SZ_64K: + if (level == 0 || (level == 1 && ia_size <= 42)) + return -EFAULT; + break; + case SZ_16K: + if (level == 0 || (level == 1 && ia_size <= 40)) + return -EFAULT; + break; + case SZ_4K: + if (level < 0 || (level == 0 && ia_size <= 42)) + return -EFAULT; + break; + } + + /* Check input size limits */ + if (input_size > ia_size) + return -EFAULT; + + /* Check number of entries in starting level table */ + start_size = input_size - ((3 - level) * stride + wi->pgshift); + if (start_size < 1 || start_size > stride + 4) + return -EFAULT; + + return 0; +} + +/* Check if output is within boundaries */ +static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) +{ + unsigned int output_size = wi->max_oa_bits; + + if (output_size != 48 && (output & GENMASK_ULL(47, output_size))) + return -1; + + return 0; +} + +static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc, + struct s2_walk_info *wi) +{ + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + /* + * Handle reversedescriptors if endianness differs between the + * host and the guest hypervisor. + */ + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; +} + +static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new, + struct s2_walk_info *wi) +{ + if (wi->be) { + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); + } else { + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + +/* + * This is essentially a C-version of the pseudo code from the ARM ARM + * AArch64.TranslationTableWalk function. 
I strongly recommend looking at + * that pseudocode in trying to understand this. + * + * Must be called with the kvm->srcu read lock held + */ +static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, + struct s2_walk_info *wi, struct kvm_s2_trans *out) +{ + int first_block_level, level, stride, input_size, base_lower_bound; + phys_addr_t base_addr; + unsigned int addr_top, addr_bottom; + u64 desc, new_desc; /* page table entry */ + int ret; + phys_addr_t paddr; + + switch (BIT(wi->pgshift)) { + default: + case SZ_64K: + case SZ_16K: + level = 3 - wi->sl; + first_block_level = 2; + break; + case SZ_4K: + level = 2 - wi->sl; + first_block_level = 1; + break; + } + + stride = wi->pgshift - 3; + input_size = get_ia_size(wi); + if (input_size > 48 || input_size < 25) + return -EFAULT; + + ret = check_base_s2_limits(wi, level, input_size, stride); + if (WARN_ON(ret)) + return ret; + + base_lower_bound = 3 + input_size - ((3 - level) * stride + + wi->pgshift); + base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound); + + if (check_output_size(wi, base_addr)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + return 1; + } + + addr_top = input_size - 1; + + while (1) { + phys_addr_t index; + + addr_bottom = (3 - level) * stride + wi->pgshift; + index = (ipa & GENMASK_ULL(addr_top, addr_bottom)) + >> (addr_bottom - 3); + + paddr = base_addr | index; + ret = read_guest_s2_desc(vcpu, paddr, &desc, wi); + if (ret < 0) + return ret; + + new_desc = desc; + + /* Check for valid descriptor at this point */ + if (!(desc & KVM_PTE_VALID)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) { + if (level < 3) + break; + + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + /* We're at the final level */ + if (level == 3) + break; + + if (check_output_size(wi, desc)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + out->desc = desc; + return 1; + } + + base_addr = desc & GENMASK_ULL(47, wi->pgshift); + + level += 1; + addr_top = addr_bottom - 1; + } + + if (level < first_block_level) { + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + if (check_output_size(wi, desc)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + out->desc = desc; + return 1; + } + + if (wi->ha) + new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + + if (new_desc != desc) { + ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + + if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); + out->desc = desc; + return 1; + } + + addr_bottom += contiguous_bit_shift(desc, wi, level); + + /* Calculate and return the result */ + paddr = (desc & GENMASK_ULL(47, addr_bottom)) | + (ipa & GENMASK_ULL(addr_bottom - 1, 0)); + out->output = paddr; + out->block_size = 1UL << ((3 - level) * stride + wi->pgshift); + out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + out->level = level; + out->desc = desc; + return 0; +} + +static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) +{ + wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK; + + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + wi->pgshift = 12; break; + case VTCR_EL2_TG0_16K: + wi->pgshift = 14; break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + + wi->sl = 
FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + /* Global limit for now, should eventually be per-VM */ + wi->max_oa_bits = min(get_kvm_ipa_limit(), + ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); + + wi->ha = vtcr & VTCR_EL2_HA; +} + +int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, + struct kvm_s2_trans *result) +{ + u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + struct s2_walk_info wi; + int ret; + + result->esr = 0; + + if (!vcpu_has_nv(vcpu)) + return 0; + + wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + vtcr_to_walk_info(vtcr, &wi); + + wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; + + ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result); + if (ret) + result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC); + + return ret; +} + +static unsigned int ttl_to_size(u8 ttl) +{ + int level = ttl & 3; + int gran = (ttl >> 2) & 3; + unsigned int max_size = 0; + + switch (gran) { + case TLBI_TTL_TG_4K: + switch (level) { + case 0: + break; + case 1: + max_size = SZ_1G; + break; + case 2: + max_size = SZ_2M; + break; + case 3: + max_size = SZ_4K; + break; + } + break; + case TLBI_TTL_TG_16K: + switch (level) { + case 0: + case 1: + break; + case 2: + max_size = SZ_32M; + break; + case 3: + max_size = SZ_16K; + break; + } + break; + case TLBI_TTL_TG_64K: + switch (level) { + case 0: + case 1: + /* No 52bit IPA support */ + break; + case 2: + max_size = SZ_512M; + break; + case 3: + max_size = SZ_64K; + break; + } + break; + default: /* No size information */ + break; + } + + return max_size; +} + +static u8 pgshift_level_to_ttl(u16 shift, u8 level) +{ + u8 ttl; + + switch(shift) { + case 12: + ttl = TLBI_TTL_TG_4K; + break; + case 14: + ttl = TLBI_TTL_TG_16K; + break; + case 16: + ttl = TLBI_TTL_TG_64K; + break; + default: + BUG(); + } + + ttl <<= 2; + ttl |= level & 3; + + return ttl; +} + +/* + * Compute the equivalent of the TTL field by parsing the shadow PT. The + * granule size is extracted from the cached VTCR_EL2.TG0 while the level is + * retrieved from first entry carrying the level as a tag. + */ +static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr) +{ + u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr; + kvm_pte_t pte; + u8 ttl, level; + + lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock); + + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + ttl = (TLBI_TTL_TG_4K << 2); + break; + case VTCR_EL2_TG0_16K: + ttl = (TLBI_TTL_TG_16K << 2); + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + ttl = (TLBI_TTL_TG_64K << 2); + break; + } + + tmp = addr; + +again: + /* Iteratively compute the block sizes for a particular granule size */ + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + if (sz < SZ_4K) sz = SZ_4K; + else if (sz < SZ_2M) sz = SZ_2M; + else if (sz < SZ_1G) sz = SZ_1G; + else sz = 0; + break; + case VTCR_EL2_TG0_16K: + if (sz < SZ_16K) sz = SZ_16K; + else if (sz < SZ_32M) sz = SZ_32M; + else sz = 0; + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + if (sz < SZ_64K) sz = SZ_64K; + else if (sz < SZ_512M) sz = SZ_512M; + else sz = 0; + break; + } + + if (sz == 0) + return 0; + + tmp &= ~(sz - 1); + if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL)) + goto again; + if (!(pte & PTE_VALID)) + goto again; + level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte); + if (!level) + goto again; + + ttl |= level; + + /* + * We now have found some level information in the shadow S2. 
Check + * that the resulting range is actually including the original IPA. + */ + sz = ttl_to_size(ttl); + if (addr < (tmp + sz)) + return ttl; + + return 0; +} + +unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); + unsigned long max_size; + u8 ttl; + + ttl = FIELD_GET(TLBI_TTL_MASK, val); + + if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) { + /* No TTL, check the shadow S2 for a hint */ + u64 addr = (val & GENMASK_ULL(35, 0)) << 12; + ttl = get_guest_mapping_ttl(mmu, addr); + } + + max_size = ttl_to_size(ttl); + + if (!max_size) { + /* Compute the maximum extent of the invalidation */ + switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + max_size = SZ_1G; + break; + case VTCR_EL2_TG0_16K: + max_size = SZ_32M; + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + /* + * No, we do not support 52bit IPA in nested yet. Once + * we do, this should be 4TB. + */ + max_size = SZ_512M; + break; + } + } + + WARN_ON(!max_size); + return max_size; +} + +/* + * We can have multiple *different* MMU contexts with the same VMID: + * + * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit + * + * - Multiple vcpus using private S2s (huh huh...), hence differing by the + * VBBTR_EL2.BADDR address + * + * - A combination of the above... + * + * We can always identify which MMU context to pick at run-time. However, + * TLB invalidation involving a VMID must take action on all the TLBs using + * this particular VMID. This translates into applying the same invalidation + * operation to all the contexts that are using this VMID. Moar phun! + */ +void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, + const union tlbi_info *info, + void (*tlbi_callback)(struct kvm_s2_mmu *, + const union tlbi_info *)) +{ + write_lock(&kvm->mmu_lock); + + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!kvm_s2_mmu_valid(mmu)) + continue; + + if (vmid == get_vmid(mmu->tlb_vttbr)) + tlbi_callback(mmu, info); + } + + write_unlock(&kvm->mmu_lock); +} + +struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + bool nested_stage2_enabled; + u64 vttbr, vtcr, hcr; + + lockdep_assert_held_write(&kvm->mmu_lock); + + vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + hcr = vcpu_read_sys_reg(vcpu, HCR_EL2); + + nested_stage2_enabled = hcr & HCR_VM; + + /* Don't consider the CnP bit for the vttbr match */ + vttbr &= ~VTTBR_CNP_BIT; + + /* + * Two possibilities when looking up a S2 MMU context: + * + * - either S2 is enabled in the guest, and we need a context that is + * S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR, + * which makes it safe from a TLB conflict perspective (a broken + * guest won't be able to generate them), + * + * - or S2 is disabled, and we need a context that is S2-disabled + * and matches the VMID only, as all TLBs are tagged by VMID even + * if S2 translation is disabled. 
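+ *
+ * Put differently, the match keys used by the loop below are:
+ *
+ *    guest S2 on:  { HCR_EL2.VM=1, VTTBR (VMID+BADDR), VTCR }
+ *    guest S2 off: { HCR_EL2.VM=0, VMID only }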
+ */ + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!kvm_s2_mmu_valid(mmu)) + continue; + + if (nested_stage2_enabled && + mmu->nested_stage2_enabled && + vttbr == mmu->tlb_vttbr && + vtcr == mmu->tlb_vtcr) + return mmu; + + if (!nested_stage2_enabled && + !mmu->nested_stage2_enabled && + get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr)) + return mmu; + } + return NULL; +} + +static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_s2_mmu *s2_mmu; + int i; + + lockdep_assert_held_write(&vcpu->kvm->mmu_lock); + + s2_mmu = lookup_s2_mmu(vcpu); + if (s2_mmu) + goto out; + + /* + * Make sure we don't always search from the same point, or we + * will always reuse a potentially active context, leaving + * free contexts unused. + */ + for (i = kvm->arch.nested_mmus_next; + i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next); + i++) { + s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size]; + + if (atomic_read(&s2_mmu->refcnt) == 0) + break; + } + BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */ + + /* Set the scene for the next search */ + kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; + + /* Make sure we don't forget to do the laundry */ + if (kvm_s2_mmu_valid(s2_mmu)) + s2_mmu->pending_unmap = true; + + /* + * The virtual VMID (modulo CnP) will be used as a key when matching + * an existing kvm_s2_mmu. + * + * We cache VTCR at allocation time, once and for all. It'd be great + * if the guest didn't screw that one up, as this is not very + * forgiving... + */ + s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT; + s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM; + +out: + atomic_inc(&s2_mmu->refcnt); + + /* + * Set the vCPU request to perform an unmap, even if the pending unmap + * originates from another vCPU. This guarantees that the MMU has been + * completely unmapped before any vCPU actually uses it, and allows + * multiple vCPUs to lend a hand with completing the unmap. + */ + if (s2_mmu->pending_unmap) + kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu); + + return s2_mmu; +} + +void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) +{ + /* CnP being set denotes an invalid entry */ + mmu->tlb_vttbr = VTTBR_CNP_BIT; + mmu->nested_stage2_enabled = false; + atomic_set(&mmu->refcnt, 0); +} + +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) +{ + /* + * If the vCPU kept its reference on the MMU after the last put, + * keep rolling with it. 
+ */ + if (is_hyp_ctxt(vcpu)) { + if (!vcpu->arch.hw_mmu) + vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + } else { + if (!vcpu->arch.hw_mmu) { + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) + vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); + } + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) + kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu); + } +} + +void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) +{ + /* Unconditionally drop the VNCR mapping if we have one */ + if (host_data_test_flag(L1_VNCR_MAPPED)) { + BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id()); + BUG_ON(is_hyp_ctxt(vcpu)); + + clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu)); + vcpu->arch.vncr_tlb->cpu = -1; + host_data_clear_flag(L1_VNCR_MAPPED); + atomic_dec(&vcpu->kvm->arch.vncr_map_count); + } + + /* + * Keep a reference on the associated stage-2 MMU if the vCPU is + * scheduling out and not in WFI emulation, suggesting it is likely to + * reuse the MMU sometime soon. + */ + if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI)) + return; + + if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) + atomic_dec(&vcpu->arch.hw_mmu->refcnt); + + vcpu->arch.hw_mmu = NULL; +} + +/* + * Returns non-zero if permission fault is handled by injecting it to the next + * level hypervisor. + */ +int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans) +{ + bool forward_fault = false; + + trans->esr = 0; + + if (!kvm_vcpu_trap_is_permission_fault(vcpu)) + return 0; + + if (kvm_vcpu_trap_is_iabt(vcpu)) { + if (vcpu_mode_priv(vcpu)) + forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans); + else + forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans); + } else { + bool write_fault = kvm_is_write_fault(vcpu); + + forward_fault = ((write_fault && !trans->writable) || + (!write_fault && !trans->readable)); + } + + if (forward_fault) + trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM); + + return forward_fault; +} + +int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2) +{ + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2); + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2); + + return kvm_inject_nested_sync(vcpu, esr_el2); +} + +static void invalidate_vncr(struct vncr_tlb *vt) +{ + vt->valid = false; + if (vt->cpu != -1) + clear_fixmap(vncr_fixmap(vt->cpu)); +} + +static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 ipa_start, ipa_end, ipa_size; + + /* + * Careful here: We end-up here from an MMU notifier, + * and this can race against a vcpu not being onlined + * yet, without the pseudo-TLB being allocated. + * + * Skip those, as they obviously don't participate in + * the invalidation at this stage. 
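+ *
+ * Example (hypothetical): a pseudo-TLB backed by a 4K level-3
+ * mapping at IPA 0x80001000 covers [0x80001000, 0x80002000); a
+ * notifier range of [0x80000000, 0x80001000) leaves it alone, while
+ * anything overlapping that single page invalidates it.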
+ */ + if (!vt) + continue; + + if (!vt->valid) + continue; + + ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, + vt->wr.level)); + ipa_start = vt->wr.pa & ~(ipa_size - 1); + ipa_end = ipa_start + ipa_size; + + if (ipa_end <= start || ipa_start >= end) + continue; + + invalidate_vncr(vt); + } +} + +struct s1e2_tlbi_scope { + enum { + TLBI_ALL, + TLBI_VA, + TLBI_VAA, + TLBI_ASID, + } type; + + u16 asid; + u64 va; + u64 size; +}; + +static void invalidate_vncr_va(struct kvm *kvm, + struct s1e2_tlbi_scope *scope) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 va_start, va_end, va_size; + + if (!vt->valid) + continue; + + va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, + vt->wr.level)); + va_start = vt->gva & ~(va_size - 1); + va_end = va_start + va_size; + + switch (scope->type) { + case TLBI_ALL: + break; + + case TLBI_VA: + if (va_end <= scope->va || + va_start >= (scope->va + scope->size)) + continue; + if (vt->wr.nG && vt->wr.asid != scope->asid) + continue; + break; + + case TLBI_VAA: + if (va_end <= scope->va || + va_start >= (scope->va + scope->size)) + continue; + break; + + case TLBI_ASID: + if (!vt->wr.nG || vt->wr.asid != scope->asid) + continue; + break; + } + + invalidate_vncr(vt); + } +} + +#define tlbi_va_s1_to_va(v) (u64)sign_extend64((v) << 12, 48) + +static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val, + struct s1e2_tlbi_scope *scope) +{ + switch (inst) { + case OP_TLBI_ALLE2: + case OP_TLBI_ALLE2IS: + case OP_TLBI_ALLE2OS: + case OP_TLBI_VMALLE1: + case OP_TLBI_VMALLE1IS: + case OP_TLBI_VMALLE1OS: + case OP_TLBI_ALLE2NXS: + case OP_TLBI_ALLE2ISNXS: + case OP_TLBI_ALLE2OSNXS: + case OP_TLBI_VMALLE1NXS: + case OP_TLBI_VMALLE1ISNXS: + case OP_TLBI_VMALLE1OSNXS: + scope->type = TLBI_ALL; + break; + case OP_TLBI_VAE2: + case OP_TLBI_VAE2IS: + case OP_TLBI_VAE2OS: + case OP_TLBI_VAE1: + case OP_TLBI_VAE1IS: + case OP_TLBI_VAE1OS: + case OP_TLBI_VAE2NXS: + case OP_TLBI_VAE2ISNXS: + case OP_TLBI_VAE2OSNXS: + case OP_TLBI_VAE1NXS: + case OP_TLBI_VAE1ISNXS: + case OP_TLBI_VAE1OSNXS: + case OP_TLBI_VALE2: + case OP_TLBI_VALE2IS: + case OP_TLBI_VALE2OS: + case OP_TLBI_VALE1: + case OP_TLBI_VALE1IS: + case OP_TLBI_VALE1OS: + case OP_TLBI_VALE2NXS: + case OP_TLBI_VALE2ISNXS: + case OP_TLBI_VALE2OSNXS: + case OP_TLBI_VALE1NXS: + case OP_TLBI_VALE1ISNXS: + case OP_TLBI_VALE1OSNXS: + scope->type = TLBI_VA; + scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val)); + if (!scope->size) + scope->size = SZ_1G; + scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1); + scope->asid = FIELD_GET(TLBIR_ASID_MASK, val); + break; + case OP_TLBI_ASIDE1: + case OP_TLBI_ASIDE1IS: + case OP_TLBI_ASIDE1OS: + case OP_TLBI_ASIDE1NXS: + case OP_TLBI_ASIDE1ISNXS: + case OP_TLBI_ASIDE1OSNXS: + scope->type = TLBI_ASID; + scope->asid = FIELD_GET(TLBIR_ASID_MASK, val); + break; + case OP_TLBI_VAAE1: + case OP_TLBI_VAAE1IS: + case OP_TLBI_VAAE1OS: + case OP_TLBI_VAAE1NXS: + case OP_TLBI_VAAE1ISNXS: + case OP_TLBI_VAAE1OSNXS: + case OP_TLBI_VAALE1: + case OP_TLBI_VAALE1IS: + case OP_TLBI_VAALE1OS: + case OP_TLBI_VAALE1NXS: + case OP_TLBI_VAALE1ISNXS: + case OP_TLBI_VAALE1OSNXS: + scope->type = TLBI_VAA; + scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val)); + if (!scope->size) + scope->size = SZ_1G; + scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1); + break; + case OP_TLBI_RVAE2: + case OP_TLBI_RVAE2IS: + case 
OP_TLBI_RVAE2OS: + case OP_TLBI_RVAE1: + case OP_TLBI_RVAE1IS: + case OP_TLBI_RVAE1OS: + case OP_TLBI_RVAE2NXS: + case OP_TLBI_RVAE2ISNXS: + case OP_TLBI_RVAE2OSNXS: + case OP_TLBI_RVAE1NXS: + case OP_TLBI_RVAE1ISNXS: + case OP_TLBI_RVAE1OSNXS: + case OP_TLBI_RVALE2: + case OP_TLBI_RVALE2IS: + case OP_TLBI_RVALE2OS: + case OP_TLBI_RVALE1: + case OP_TLBI_RVALE1IS: + case OP_TLBI_RVALE1OS: + case OP_TLBI_RVALE2NXS: + case OP_TLBI_RVALE2ISNXS: + case OP_TLBI_RVALE2OSNXS: + case OP_TLBI_RVALE1NXS: + case OP_TLBI_RVALE1ISNXS: + case OP_TLBI_RVALE1OSNXS: + scope->type = TLBI_VA; + scope->va = decode_range_tlbi(val, &scope->size, &scope->asid); + break; + case OP_TLBI_RVAAE1: + case OP_TLBI_RVAAE1IS: + case OP_TLBI_RVAAE1OS: + case OP_TLBI_RVAAE1NXS: + case OP_TLBI_RVAAE1ISNXS: + case OP_TLBI_RVAAE1OSNXS: + case OP_TLBI_RVAALE1: + case OP_TLBI_RVAALE1IS: + case OP_TLBI_RVAALE1OS: + case OP_TLBI_RVAALE1NXS: + case OP_TLBI_RVAALE1ISNXS: + case OP_TLBI_RVAALE1OSNXS: + scope->type = TLBI_VAA; + scope->va = decode_range_tlbi(val, &scope->size, NULL); + break; + } +} + +void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val) +{ + struct s1e2_tlbi_scope scope = {}; + + compute_s1_tlbi_range(vcpu, inst, val, &scope); + + guard(write_lock)(&vcpu->kvm->mmu_lock); + invalidate_vncr_va(vcpu->kvm, &scope); +} + +void kvm_nested_s2_wp(struct kvm *kvm) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu)); + } + + kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits)); +} + +void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block); + } + + kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits)); +} + +void kvm_nested_s2_flush(struct kvm *kvm) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu)); + } +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!WARN_ON(atomic_read(&mmu->refcnt))) + kvm_free_stage2_pgd(mmu); + } + kvfree(kvm->arch.nested_mmus); + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; + kvm_uninit_stage2_mmu(kvm); +} + +/* + * Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter: + * + * - We introduce an internal representation of a vcpu-private TLB, + * representing the mapping between the guest VA contained in VNCR_EL2, + * the IPA the guest's EL2 PTs point to, and the actual PA this lives at. + * + * - On translation fault from a nested VNCR access, we create such a TLB. + * If there is no mapping to describe, the guest inherits the fault. + * Crucially, no actual mapping is done at this stage. + * + * - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above + * TLB exists, we map it in the fixmap for this CPU, and run with it. We + * have to respect the permissions dictated by the guest, but not the + * memory type (FWB is a must). 
+ * + * - Note that we usually don't do a vcpu_load() on the back of a fault + * (unless we are preempted), so the resolution of a translation fault + * must go via a request that will map the VNCR page in the fixmap. + * vcpu_load() might as well use the same mechanism. + * + * - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was + * mapped, we unmap it. Yes it is that simple. The TLB still exists + * though, and may be reused at a later load. + * + * - On permission fault, we simply forward the fault to the guest's EL2. + * Get out of my way. + * + * - On any TLBI for the EL2&0 translation regime, we must find any TLB that + * intersects with the TLBI request, invalidate it, and unmap the page + * from the fixmap. Because we need to look at all the vcpu-private TLBs, + * this requires some wide-ranging locking to ensure that nothing races + * against it. This may require some refcounting to avoid the search when + * no such TLB is present. + * + * - On MMU notifiers, we must invalidate our TLB in a similar way, but + * looking at the IPA instead. The funny part is that there may not be a + * stage-2 mapping for this page if L1 hasn't accessed it using LD/ST + * instructions. + */ + +int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu) +{ + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return 0; + + vcpu->arch.vncr_tlb = kzalloc(sizeof(*vcpu->arch.vncr_tlb), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.vncr_tlb) + return -ENOMEM; + + return 0; +} + +static u64 read_vncr_el2(struct kvm_vcpu *vcpu) +{ + return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48); +} + +static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem) +{ + struct kvm_memory_slot *memslot; + bool write_fault, writable; + unsigned long mmu_seq; + struct vncr_tlb *vt; + struct page *page; + u64 va, pfn, gfn; + int ret; + + vt = vcpu->arch.vncr_tlb; + + /* + * If we're about to walk the EL2 S1 PTs, we must invalidate the + * current TLB, as it could be sampled from another vcpu doing a + * TLBI *IS. A real CPU wouldn't do that, but we only keep a single + * translation, so not much of a choice. + * + * We also prepare the next walk wilst we're at it. + */ + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { + invalidate_vncr(vt); + + vt->wi = (struct s1_walk_info) { + .regime = TR_EL20, + .as_el0 = false, + .pan = false, + }; + vt->wr = (struct s1_walk_result){}; + } + + guard(srcu)(&vcpu->kvm->srcu); + + va = read_vncr_el2(vcpu); + + ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va); + if (ret) + return ret; + + write_fault = kvm_is_write_fault(vcpu); + + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + gfn = vt->wr.pa >> PAGE_SHIFT; + memslot = gfn_to_memslot(vcpu->kvm, gfn); + if (!memslot) + return -EFAULT; + + *is_gmem = kvm_slot_has_gmem(memslot); + if (!*is_gmem) { + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? 
FOLL_WRITE : 0, + &writable, &page); + if (is_error_noslot_pfn(pfn) || (write_fault && !writable)) + return -EFAULT; + } else { + ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL); + if (ret) { + kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE, + write_fault, false, false); + return ret; + } + } + + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { + if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) + return -EAGAIN; + + vt->gva = va; + vt->hpa = pfn << PAGE_SHIFT; + vt->valid = true; + vt->cpu = -1; + + kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu); + kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw); + } + + if (vt->wr.pw) + mark_page_dirty(vcpu->kvm, gfn); + + return 0; +} + +static void inject_vncr_perm(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 esr = kvm_vcpu_get_esr(vcpu); + + /* Adjust the fault level to reflect that of the guest's */ + esr &= ~ESR_ELx_FSC; + esr |= FIELD_PREP(ESR_ELx_FSC, + ESR_ELx_FSC_PERM_L(vt->wr.level)); + + kvm_inject_nested_sync(vcpu, esr); +} + +static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + + lockdep_assert_held_read(&vcpu->kvm->mmu_lock); + + if (!vt->valid) + return false; + + if (read_vncr_el2(vcpu) != vt->gva) + return false; + + if (vt->wr.nG) { + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + u64 ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + u16 asid; + + asid = FIELD_GET(TTBR_ASID_MASK, ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + return asid == vt->wr.asid; + } + + return true; +} + +int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 esr = kvm_vcpu_get_esr(vcpu); + + WARN_ON_ONCE(!(esr & ESR_ELx_VNCR)); + + if (kvm_vcpu_abt_issea(vcpu)) + return kvm_handle_guest_sea(vcpu); + + if (esr_fsc_is_permission_fault(esr)) { + inject_vncr_perm(vcpu); + } else if (esr_fsc_is_translation_fault(esr)) { + bool valid, is_gmem = false; + int ret; + + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + valid = kvm_vncr_tlb_lookup(vcpu); + + if (!valid) + ret = kvm_translate_vncr(vcpu, &is_gmem); + else + ret = -EPERM; + + switch (ret) { + case -EAGAIN: + /* Let's try again... */ + break; + case -ENOMEM: + /* + * For guest_memfd, this indicates that it failed to + * create a folio to back the memory. Inform userspace. + */ + if (is_gmem) + return 0; + /* Otherwise, let's try again... */ + break; + case -EFAULT: + case -EIO: + case -EHWPOISON: + if (is_gmem) + return 0; + fallthrough; + case -EINVAL: + case -ENOENT: + case -EACCES: + /* + * Translation failed, inject the corresponding + * exception back to EL2. + */ + BUG_ON(!vt->wr.failed); + + esr &= ~ESR_ELx_FSC; + esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst); + + kvm_inject_nested_sync(vcpu, esr); + break; + case -EPERM: + /* Hack to deal with POE until we get kernel support */ + inject_vncr_perm(vcpu); + break; + case 0: + break; + } + } else { + WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr); + } + + return 1; +} + +static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + pgprot_t prot; + + guard(preempt)(); + guard(read_lock)(&vcpu->kvm->mmu_lock); + + /* + * The request to map VNCR may have raced against some other + * event, such as an interrupt, and may not be valid anymore. 
+ */ + if (is_hyp_ctxt(vcpu)) + return; + + /* + * Check that the pseudo-TLB is valid and that VNCR_EL2 still + * contains the expected value. If it doesn't, we simply bail out + * without a mapping -- a transformed MSR/MRS will generate the + * fault and allows us to populate the pseudo-TLB. + */ + if (!vt->valid) + return; + + if (read_vncr_el2(vcpu) != vt->gva) + return; + + if (vt->wr.nG) { + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + u64 ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + u16 asid; + + asid = FIELD_GET(TTBR_ASID_MASK, ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + if (asid != vt->wr.asid) + return; + } + + vt->cpu = smp_processor_id(); + + if (vt->wr.pw && vt->wr.pr) + prot = PAGE_KERNEL; + else if (vt->wr.pr) + prot = PAGE_KERNEL_RO; + else + prot = PAGE_NONE; + + /* + * We can't map write-only (or no permission at all) in the kernel, + * but the guest can do it if using POE, so we'll have to turn a + * translation fault into a permission fault at runtime. + * FIXME: WO doesn't work at all, need POE support in the kernel. + */ + if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) { + __set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot); + host_data_set_flag(L1_VNCR_MAPPED); + atomic_inc(&vcpu->kvm->arch.vncr_map_count); + } +} + +#define has_tgran_2(__r, __sz) \ + ({ \ + u64 _s1, _s2, _mmfr0 = __r; \ + \ + _s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz##_2, _mmfr0); \ + \ + _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz, _mmfr0); \ + \ + ((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \ + _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \ + (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \ + _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \ + }) +/* + * Our emulated CPU doesn't support all the possible features. For the + * sake of simplicity (and probably mental sanity), wipe out a number + * of feature bits we don't intend to support for the time being. + * This list should get updated as new features get added to the NV + * support, and new extension to the architecture. 
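+ *
+ * The pattern is always the same (sketch): mask out the field and,
+ * where needed, force a safe value back in, e.g.
+ *
+ *    val &= ~ID_AA64ISAR0_EL1_TME;    // hide TME from the guest
+ *
+ * so a feature the host has can still be reported as absent.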
+ */ +u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) +{ + u64 orig_val = val; + + switch (reg) { + case SYS_ID_AA64ISAR0_EL1: + /* Support everything but TME */ + val &= ~ID_AA64ISAR0_EL1_TME; + break; + + case SYS_ID_AA64ISAR1_EL1: + /* Support everything but LS64 and Spec Invalidation */ + val &= ~(ID_AA64ISAR1_EL1_LS64 | + ID_AA64ISAR1_EL1_SPECRES); + break; + + case SYS_ID_AA64PFR0_EL1: + /* No RME, AMU, MPAM, or S-EL2 */ + val &= ~(ID_AA64PFR0_EL1_RME | + ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SEL2 | + ID_AA64PFR0_EL1_EL3 | + ID_AA64PFR0_EL1_EL2 | + ID_AA64PFR0_EL1_EL1 | + ID_AA64PFR0_EL1_EL0); + /* 64bit only at any EL */ + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP); + break; + + case SYS_ID_AA64PFR1_EL1: + /* Only support BTI, SSBS, CSV2_frac */ + val &= ~(ID_AA64PFR1_EL1_PFAR | + ID_AA64PFR1_EL1_MTEX | + ID_AA64PFR1_EL1_THE | + ID_AA64PFR1_EL1_GCS | + ID_AA64PFR1_EL1_MTE_frac | + ID_AA64PFR1_EL1_NMI | + ID_AA64PFR1_EL1_SME | + ID_AA64PFR1_EL1_RES0 | + ID_AA64PFR1_EL1_MPAM_frac | + ID_AA64PFR1_EL1_MTE); + break; + + case SYS_ID_AA64MMFR0_EL1: + /* Hide ExS, Secure Memory */ + val &= ~(ID_AA64MMFR0_EL1_EXS | + ID_AA64MMFR0_EL1_TGRAN4_2 | + ID_AA64MMFR0_EL1_TGRAN16_2 | + ID_AA64MMFR0_EL1_TGRAN64_2 | + ID_AA64MMFR0_EL1_SNSMEM); + + /* Hide CNTPOFF if present */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP); + + /* Disallow unsupported S2 page sizes */ + switch (PAGE_SIZE) { + case SZ_64K: + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI); + fallthrough; + case SZ_16K: + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI); + fallthrough; + case SZ_4K: + /* Support everything */ + break; + } + + /* + * Since we can't support a guest S2 page size smaller + * than the host's own page size (due to KVM only + * populating its own S2 using the kernel's page + * size), advertise the limitation using FEAT_GTG. + */ + switch (PAGE_SIZE) { + case SZ_4K: + if (has_tgran_2(orig_val, 4)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP); + fallthrough; + case SZ_16K: + if (has_tgran_2(orig_val, 16)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP); + fallthrough; + case SZ_64K: + if (has_tgran_2(orig_val, 64)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP); + break; + } + + /* Cap PARange to 48bits */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48); + break; + + case SYS_ID_AA64MMFR1_EL1: + val &= ~(ID_AA64MMFR1_EL1_CMOW | + ID_AA64MMFR1_EL1_nTLBPA | + ID_AA64MMFR1_EL1_ETS); + + /* FEAT_E2H0 implies no VHE */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) + val &= ~ID_AA64MMFR1_EL1_VH; + + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF); + break; + + case SYS_ID_AA64MMFR2_EL1: + val &= ~(ID_AA64MMFR2_EL1_BBM | + ID_AA64MMFR2_EL1_TTL | + GENMASK_ULL(47, 44) | + ID_AA64MMFR2_EL1_ST | + ID_AA64MMFR2_EL1_CCIDX | + ID_AA64MMFR2_EL1_VARange); + + /* Force TTL support */ + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP); + break; + + case SYS_ID_AA64MMFR4_EL1: + /* + * You get EITHER + * + * - FEAT_VHE without FEAT_E2H0 + * - FEAT_NV limited to FEAT_NV2 + * - HCR_EL2.NV1 being RES0 + * + * OR + * + * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV + * + * Life is too short for anything else. 
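+ *
+ * Concretely (sketch of the two outcomes below): selecting the
+ * KVM_ARM_VCPU_HAS_EL2_E2H0 feature zeroes ID_AA64MMFR4_EL1 entirely,
+ * while the default advertises NV_frac == NV2_ONLY and E2H0 == NI_NV1
+ * to the guest hypervisor.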
+ */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) { + val = 0; + } else { + val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1); + } + break; + + case SYS_ID_AA64DFR0_EL1: + /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */ + val &= ~(ID_AA64DFR0_EL1_ExtTrcBuff | + ID_AA64DFR0_EL1_BRBE | + ID_AA64DFR0_EL1_MTPMU | + ID_AA64DFR0_EL1_TraceBuffer | + ID_AA64DFR0_EL1_TraceFilt | + ID_AA64DFR0_EL1_PMSVer | + ID_AA64DFR0_EL1_CTX_CMPs | + ID_AA64DFR0_EL1_SEBEP | + ID_AA64DFR0_EL1_PMSS | + ID_AA64DFR0_EL1_TraceVer); + + /* + * FEAT_Debugv8p9 requires support for extended breakpoints / + * watchpoints. + */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); + break; + } + + return val; +} + +u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg sr, u64 v) +{ + struct kvm_sysreg_masks *masks; + + masks = vcpu->kvm->arch.sysreg_masks; + + if (masks) { + sr -= __SANITISED_REG_START__; + + v &= ~masks->mask[sr].res0; + v |= masks->mask[sr].res1; + } + + return v; +} + +static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) +{ + int i = sr - __SANITISED_REG_START__; + + BUILD_BUG_ON(!__builtin_constant_p(sr)); + BUILD_BUG_ON(sr < __SANITISED_REG_START__); + BUILD_BUG_ON(sr >= NR_SYS_REGS); + + kvm->arch.sysreg_masks->mask[i].res0 = res0; + kvm->arch.sysreg_masks->mask[i].res1 = res1; +} + +int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 res0, res1; + + lockdep_assert_held(&kvm->arch.config_lock); + + if (kvm->arch.sysreg_masks) + goto out; + + kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), + GFP_KERNEL_ACCOUNT); + if (!kvm->arch.sysreg_masks) + return -ENOMEM; + + /* VTTBR_EL2 */ + res0 = res1 = 0; + if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16)) + res0 |= GENMASK(63, 56); + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP)) + res0 |= VTTBR_CNP_BIT; + set_sysreg_masks(kvm, VTTBR_EL2, res0, res1); + + /* VTCR_EL2 */ + res0 = GENMASK(63, 32) | GENMASK(30, 20); + res1 = BIT(31); + set_sysreg_masks(kvm, VTCR_EL2, res0, res1); + + /* VMPIDR_EL2 */ + res0 = GENMASK(63, 40) | GENMASK(30, 24); + res1 = BIT(31); + set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1); + + /* HCR_EL2 */ + get_reg_fixed_bits(kvm, HCR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HCR_EL2, res0, res1); + + /* HCRX_EL2 */ + get_reg_fixed_bits(kvm, HCRX_EL2, &res0, &res1); + set_sysreg_masks(kvm, HCRX_EL2, res0, res1); + + /* HFG[RW]TR_EL2 */ + get_reg_fixed_bits(kvm, HFGRTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGRTR_EL2, res0, res1); + get_reg_fixed_bits(kvm, HFGWTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGWTR_EL2, res0, res1); + + /* HDFG[RW]TR_EL2 */ + get_reg_fixed_bits(kvm, HDFGRTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGRTR_EL2, res0, res1); + get_reg_fixed_bits(kvm, HDFGWTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGWTR_EL2, res0, res1); + + /* HFGITR_EL2 */ + get_reg_fixed_bits(kvm, HFGITR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGITR_EL2, res0, res1); + + /* HAFGRTR_EL2 - not a lot to see here */ + get_reg_fixed_bits(kvm, HAFGRTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); + + /* HFG[RW]TR2_EL2 */ + get_reg_fixed_bits(kvm, HFGRTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGRTR2_EL2, res0, res1); + get_reg_fixed_bits(kvm, HFGWTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGWTR2_EL2, res0, res1); + + /* 
HDFG[RW]TR2_EL2 */ + get_reg_fixed_bits(kvm, HDFGRTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGRTR2_EL2, res0, res1); + get_reg_fixed_bits(kvm, HDFGWTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGWTR2_EL2, res0, res1); + + /* HFGITR2_EL2 */ + get_reg_fixed_bits(kvm, HFGITR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1); + + /* TCR2_EL2 */ + get_reg_fixed_bits(kvm, TCR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, TCR2_EL2, res0, res1); + + /* SCTLR_EL1 */ + get_reg_fixed_bits(kvm, SCTLR_EL1, &res0, &res1); + set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + + /* SCTLR2_ELx */ + get_reg_fixed_bits(kvm, SCTLR2_EL1, &res0, &res1); + set_sysreg_masks(kvm, SCTLR2_EL1, res0, res1); + get_reg_fixed_bits(kvm, SCTLR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, SCTLR2_EL2, res0, res1); + + /* MDCR_EL2 */ + get_reg_fixed_bits(kvm, MDCR_EL2, &res0, &res1); + set_sysreg_masks(kvm, MDCR_EL2, res0, res1); + + /* CNTHCTL_EL2 */ + res0 = GENMASK(63, 20); + res1 = 0; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP)) + res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) { + res0 |= CNTHCTL_ECV; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP)) + res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT | + CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT); + } + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= GENMASK(11, 8); + set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1); + + /* ICH_HCR_EL2 */ + res0 = ICH_HCR_EL2_RES0; + res1 = ICH_HCR_EL2_RES1; + if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS)) + res0 |= ICH_HCR_EL2_TDIR; + /* No GICv4 is presented to the guest */ + res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount; + set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1); + + /* VNCR_EL2 */ + set_sysreg_masks(kvm, VNCR_EL2, VNCR_EL2_RES0, VNCR_EL2_RES1); + +out: + for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) + __vcpu_rmw_sys_reg(vcpu, sr, |=, 0); + + return 0; +} + +void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) { + struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; + + write_lock(&vcpu->kvm->mmu_lock); + if (mmu->pending_unmap) { + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true); + mmu->pending_unmap = false; + } + write_unlock(&vcpu->kvm->mmu_lock); + } + + if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu)) + kvm_map_l1_vncr(vcpu); + + /* Must be last, as may switch context! */ + if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu)) + kvm_inject_nested_irq(vcpu); +} + +/* + * One of the many architectural bugs in FEAT_NV2 is that the guest hypervisor + * can write to HCR_EL2 behind our back, potentially changing the exception + * routing / masking for even the host context. + * + * What follows is some slop to (1) react to exception routing / masking and (2) + * preserve the pending SError state across translation regimes. + */ +void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu) +{ + if (!vcpu_has_nv(vcpu)) + return; + + if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING))) + kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu)); +} + +void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu) +{ + unsigned long *hcr = vcpu_hcr(vcpu); + + if (!vcpu_has_nv(vcpu)) + return; + + /* + * We previously decided that an SError was deliverable to the guest. + * Reap the pending state from HCR_EL2 and... 
+ */ + if (unlikely(__test_and_clear_bit(__ffs(HCR_VSE), hcr))) + vcpu_set_flag(vcpu, NESTED_SERROR_PENDING); + + /* + * Re-attempt SError injection in case the deliverability has changed, + * which is necessary to faithfully emulate WFI the case of a pending + * SError being a wakeup condition. + */ + if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING))) + kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu)); +} + +/* + * KVM unconditionally sets most of these traps anyway but use an allowlist + * to document the guest hypervisor traps that may take precedence and guard + * against future changes to the non-nested trap configuration. + */ +#define NV_MDCR_GUEST_INCLUDE (MDCR_EL2_TDE | \ + MDCR_EL2_TDA | \ + MDCR_EL2_TDRA | \ + MDCR_EL2_TTRF | \ + MDCR_EL2_TPMS | \ + MDCR_EL2_TPM | \ + MDCR_EL2_TPMCR | \ + MDCR_EL2_TDCC | \ + MDCR_EL2_TDOSA) + +void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu) +{ + u64 guest_mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (is_nested_ctxt(vcpu)) + vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE); + /* + * In yet another example where FEAT_NV2 is fscking broken, accesses + * to MDSCR_EL1 are redirected to the VNCR despite having an effect + * at EL2. Use a big hammer to apply sanity. + * + * Unless of course we have FEAT_FGT, in which case we can precisely + * trap MDSCR_EL1. + */ + else if (!cpus_have_final_cap(ARM64_HAS_FGT)) + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; +} diff --git a/arch/arm64/kvm/pauth.c b/arch/arm64/kvm/pauth.c new file mode 100644 index 000000000000..d5eb3ae876be --- /dev/null +++ b/arch/arm64/kvm/pauth.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024 - Google LLC + * Author: Marc Zyngier <maz@kernel.org> + * + * Primitive PAuth emulation for ERETAA/ERETAB. + * + * This code assumes that is is run from EL2, and that it is part of + * the emulation of ERETAx for a guest hypervisor. That's a lot of + * baked-in assumptions and shortcuts. + * + * Do no reuse for anything else! + */ + +#include <linux/kvm_host.h> + +#include <asm/gpr-num.h> +#include <asm/kvm_emulate.h> +#include <asm/pointer_auth.h> + +/* PACGA Xd, Xn, Xm */ +#define PACGA(d,n,m) \ + asm volatile(__DEFINE_ASM_GPR_NUMS \ + ".inst 0x9AC03000 |" \ + "(.L__gpr_num_%[Rd] << 0) |" \ + "(.L__gpr_num_%[Rn] << 5) |" \ + "(.L__gpr_num_%[Rm] << 16)\n" \ + : [Rd] "=r" ((d)) \ + : [Rn] "r" ((n)), [Rm] "r" ((m))) + +static u64 compute_pac(struct kvm_vcpu *vcpu, u64 ptr, + struct ptrauth_key ikey) +{ + struct ptrauth_key gkey; + u64 mod, pac = 0; + + preempt_disable(); + + if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) + mod = __vcpu_sys_reg(vcpu, SP_EL2); + else + mod = read_sysreg(sp_el1); + + gkey.lo = read_sysreg_s(SYS_APGAKEYLO_EL1); + gkey.hi = read_sysreg_s(SYS_APGAKEYHI_EL1); + + __ptrauth_key_install_nosync(APGA, ikey); + isb(); + + PACGA(pac, ptr, mod); + isb(); + + __ptrauth_key_install_nosync(APGA, gkey); + + preempt_enable(); + + /* PAC in the top 32bits */ + return pac; +} + +static bool effective_tbi(struct kvm_vcpu *vcpu, bool bit55) +{ + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + bool tbi, tbid; + + /* + * Since we are authenticating an instruction address, we have + * to take TBID into account. If E2H==0, ignore VA[55], as + * TCR_EL2 only has a single TBI/TBID. If VA[55] was set in + * this case, this is likely a guest bug... 
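+ *
+ * Summarised (illustrative): the top byte is excluded from the PAC
+ * field only when TBI is set *and* TBID is clear for the relevant
+ * half of the address space, i.e.
+ *
+ *    E2H=0:            TCR_EL2[20] && !TCR_EL2[29]
+ *    E2H=1, VA[55]=1:  TCR_TBI1 && !TCR_TBID1
+ *    E2H=1, VA[55]=0:  TCR_TBI0 && !TCR_TBID0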
+ */ + if (!vcpu_el2_e2h_is_set(vcpu)) { + tbi = tcr & BIT(20); + tbid = tcr & BIT(29); + } else if (bit55) { + tbi = tcr & TCR_TBI1; + tbid = tcr & TCR_TBID1; + } else { + tbi = tcr & TCR_TBI0; + tbid = tcr & TCR_TBID0; + } + + return tbi && !tbid; +} + +static int compute_bottom_pac(struct kvm_vcpu *vcpu, bool bit55) +{ + static const int maxtxsz = 39; // Revisit these two values once + static const int mintxsz = 16; // (if) we support TTST/LVA/LVA2 + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + int txsz; + + if (!vcpu_el2_e2h_is_set(vcpu) || !bit55) + txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + else + txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + + return 64 - clamp(txsz, mintxsz, maxtxsz); +} + +static u64 compute_pac_mask(struct kvm_vcpu *vcpu, bool bit55) +{ + int bottom_pac; + u64 mask; + + bottom_pac = compute_bottom_pac(vcpu, bit55); + + mask = GENMASK(54, bottom_pac); + if (!effective_tbi(vcpu, bit55)) + mask |= GENMASK(63, 56); + + return mask; +} + +static u64 to_canonical_addr(struct kvm_vcpu *vcpu, u64 ptr, u64 mask) +{ + bool bit55 = !!(ptr & BIT(55)); + + if (bit55) + return ptr | mask; + + return ptr & ~mask; +} + +static u64 corrupt_addr(struct kvm_vcpu *vcpu, u64 ptr) +{ + bool bit55 = !!(ptr & BIT(55)); + u64 mask, error_code; + int shift; + + if (effective_tbi(vcpu, bit55)) { + mask = GENMASK(54, 53); + shift = 53; + } else { + mask = GENMASK(62, 61); + shift = 61; + } + + if (esr_iss_is_eretab(kvm_vcpu_get_esr(vcpu))) + error_code = 2 << shift; + else + error_code = 1 << shift; + + ptr &= ~mask; + ptr |= error_code; + + return ptr; +} + +/* + * Authenticate an ERETAA/ERETAB instruction, returning true if the + * authentication succeeded and false otherwise. In all cases, *elr + * contains the VA to ERET to. Potential exception injection is left + * to the caller. + */ +bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr) +{ + u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 ptr, cptr, pac, mask; + struct ptrauth_key ikey; + + *elr = ptr = vcpu_read_sys_reg(vcpu, ELR_EL2); + + /* We assume we're already in the context of an ERETAx */ + if (esr_iss_is_eretab(esr)) { + if (!(sctlr & SCTLR_EL1_EnIB)) + return true; + + ikey.lo = __vcpu_sys_reg(vcpu, APIBKEYLO_EL1); + ikey.hi = __vcpu_sys_reg(vcpu, APIBKEYHI_EL1); + } else { + if (!(sctlr & SCTLR_EL1_EnIA)) + return true; + + ikey.lo = __vcpu_sys_reg(vcpu, APIAKEYLO_EL1); + ikey.hi = __vcpu_sys_reg(vcpu, APIAKEYHI_EL1); + } + + mask = compute_pac_mask(vcpu, !!(ptr & BIT(55))); + cptr = to_canonical_addr(vcpu, ptr, mask); + + pac = compute_pac(vcpu, cptr, ikey); + + /* + * Slightly deviate from the pseudocode: if we have a PAC + * match with the signed pointer, then it must be good. + * Anything after this point is pure error handling. + */ + if ((pac & mask) == (ptr & mask)) { + *elr = cptr; + return true; + } + + /* + * Authentication failed, corrupt the canonical address if + * PAuth2 isn't implemented, or some XORing if it is. 
+ */ + if (!kvm_has_pauth(vcpu->kvm, PAuth2)) + cptr = corrupt_addr(vcpu, cptr); + else + cptr = ptr ^ (pac & mask); + + *elr = cptr; + return false; +} diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c new file mode 100644 index 000000000000..d7a0f69a9982 --- /dev/null +++ b/arch/arm64/kvm/pkvm.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 - Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <linux/init.h> +#include <linux/interval_tree_generic.h> +#include <linux/kmemleak.h> +#include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> +#include <linux/memblock.h> +#include <linux/mutex.h> + +#include <asm/kvm_pkvm.h> + +#include "hyp_constants.h" + +DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); + +static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); +static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); + +phys_addr_t hyp_mem_base; +phys_addr_t hyp_mem_size; + +static int __init register_memblock_regions(void) +{ + struct memblock_region *reg; + + for_each_mem_region(reg) { + if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) + return -ENOMEM; + + hyp_memory[*hyp_memblock_nr_ptr] = *reg; + (*hyp_memblock_nr_ptr)++; + } + + return 0; +} + +void __init kvm_hyp_reserve(void) +{ + u64 hyp_mem_pages = 0; + int ret; + + if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) + return; + + if (kvm_get_mode() != KVM_MODE_PROTECTED) + return; + + ret = register_memblock_regions(); + if (ret) { + *hyp_memblock_nr_ptr = 0; + kvm_err("Failed to register hyp memblocks: %d\n", ret); + return; + } + + hyp_mem_pages += hyp_s1_pgtable_pages(); + hyp_mem_pages += host_s2_pgtable_pages(); + hyp_mem_pages += hyp_vm_table_pages(); + hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); + hyp_mem_pages += pkvm_selftest_pages(); + hyp_mem_pages += hyp_ffa_proxy_pages(); + + /* + * Try to allocate a PMD-aligned region to reduce TLB pressure once + * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. + */ + hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; + hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), + PMD_SIZE); + if (!hyp_mem_base) + hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); + else + hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); + + if (!hyp_mem_base) { + kvm_err("Failed to reserve hyp memory\n"); + return; + } + + kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, + hyp_mem_base); +} + +static void __pkvm_destroy_hyp_vm(struct kvm *kvm) +{ + if (pkvm_hyp_vm_is_created(kvm)) { + WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, + kvm->arch.pkvm.handle)); + } else if (kvm->arch.pkvm.handle) { + /* + * The VM could have been reserved but hyp initialization has + * failed. Make sure to unreserve it. 
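+		 * In both cases the handle is invalidated and the teardown
+		 * memcaches are freed below.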
+ */ + kvm_call_hyp_nvhe(__pkvm_unreserve_vm, kvm->arch.pkvm.handle); + } + + kvm->arch.pkvm.handle = 0; + kvm->arch.pkvm.is_created = false; + free_hyp_memcache(&kvm->arch.pkvm.teardown_mc); + free_hyp_memcache(&kvm->arch.pkvm.stage2_teardown_mc); +} + +static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) +{ + size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); + pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle; + void *hyp_vcpu; + int ret; + + vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; + + hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vcpu) + return -ENOMEM; + + ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu); + if (!ret) + vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED); + else + free_pages_exact(hyp_vcpu, hyp_vcpu_sz); + + return ret; +} + +/* + * Allocates and donates memory for hypervisor VM structs at EL2. + * + * Allocates space for the VM state, which includes the hyp vm as well as + * the hyp vcpus. + * + * Stores an opaque handler in the kvm struct for future reference. + * + * Return 0 on success, negative error code on failure. + */ +static int __pkvm_create_hyp_vm(struct kvm *kvm) +{ + size_t pgd_sz, hyp_vm_sz; + void *pgd, *hyp_vm; + int ret; + + if (kvm->created_vcpus < 1) + return -EINVAL; + + pgd_sz = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr); + + /* + * The PGD pages will be reclaimed using a hyp_memcache which implies + * page granularity. So, use alloc_pages_exact() to get individual + * refcounts. + */ + pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT); + if (!pgd) + return -ENOMEM; + + /* Allocate memory to donate to hyp for vm and vcpu pointers. */ + hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, + size_mul(sizeof(void *), + kvm->created_vcpus))); + hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vm) { + ret = -ENOMEM; + goto free_pgd; + } + + /* Donate the VM memory to hyp and let hyp initialize it. */ + ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd); + if (ret) + goto free_vm; + + kvm->arch.pkvm.is_created = true; + kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; + kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE); + + return 0; +free_vm: + free_pages_exact(hyp_vm, hyp_vm_sz); +free_pgd: + free_pages_exact(pgd, pgd_sz); + return ret; +} + +bool pkvm_hyp_vm_is_created(struct kvm *kvm) +{ + return READ_ONCE(kvm->arch.pkvm.is_created); +} + +int pkvm_create_hyp_vm(struct kvm *kvm) +{ + int ret = 0; + + mutex_lock(&kvm->arch.config_lock); + if (!pkvm_hyp_vm_is_created(kvm)) + ret = __pkvm_create_hyp_vm(kvm); + mutex_unlock(&kvm->arch.config_lock); + + return ret; +} + +int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) +{ + int ret = 0; + + mutex_lock(&vcpu->kvm->arch.config_lock); + if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED)) + ret = __pkvm_create_hyp_vcpu(vcpu); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return ret; +} + +void pkvm_destroy_hyp_vm(struct kvm *kvm) +{ + mutex_lock(&kvm->arch.config_lock); + __pkvm_destroy_hyp_vm(kvm); + mutex_unlock(&kvm->arch.config_lock); +} + +int pkvm_init_host_vm(struct kvm *kvm) +{ + int ret; + + if (pkvm_hyp_vm_is_created(kvm)) + return -EINVAL; + + /* VM is already reserved, no need to proceed. */ + if (kvm->arch.pkvm.handle) + return 0; + + /* Reserve the VM in hyp and obtain a hyp handle for the VM. 
*/ + ret = kvm_call_hyp_nvhe(__pkvm_reserve_vm); + if (ret < 0) + return ret; + + kvm->arch.pkvm.handle = ret; + + return 0; +} + +static void __init _kvm_host_prot_finalize(void *arg) +{ + int *err = arg; + + if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize))) + WRITE_ONCE(*err, -EINVAL); +} + +static int __init pkvm_drop_host_privileges(void) +{ + int ret = 0; + + /* + * Flip the static key upfront as that may no longer be possible + * once the host stage 2 is installed. + */ + static_branch_enable(&kvm_protected_mode_initialized); + on_each_cpu(_kvm_host_prot_finalize, &ret, 1); + return ret; +} + +static int __init finalize_pkvm(void) +{ + int ret; + + if (!is_protected_kvm_enabled() || !is_kvm_arm_initialised()) + return 0; + + /* + * Exclude HYP sections from kmemleak so that they don't get peeked + * at, which would end badly once inaccessible. + */ + kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); + kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start); + kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start); + kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); + + ret = pkvm_drop_host_privileges(); + if (ret) + pr_err("Failed to finalize Hyp protection: %d\n", ret); + + return ret; +} +device_initcall_sync(finalize_pkvm); + +static u64 __pkvm_mapping_start(struct pkvm_mapping *m) +{ + return m->gfn * PAGE_SIZE; +} + +static u64 __pkvm_mapping_end(struct pkvm_mapping *m) +{ + return (m->gfn + m->nr_pages) * PAGE_SIZE - 1; +} + +INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last, + __pkvm_mapping_start, __pkvm_mapping_end, static, + pkvm_mapping); + +/* + * __tmp is updated to iter_first(pkvm_mappings) *before* entering the body of the loop to allow + * freeing of __map inline. + */ +#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \ + for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings, \ + __start, __end - 1); \ + __tmp && ({ \ + __map = __tmp; \ + __tmp = pkvm_mapping_iter_next(__map, __start, __end - 1); \ + true; \ + }); \ + ) + +int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) +{ + pgt->pkvm_mappings = RB_ROOT_CACHED; + pgt->mmu = mmu; + + return 0; +} + +static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret; + + if (!handle) + return 0; + + for_each_mapping_in_range_safe(pgt, start, end, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn, + mapping->nr_pages); + if (WARN_ON(ret)) + return ret; + pkvm_mapping_remove(mapping, &pgt->pkvm_mappings); + kfree(mapping); + } + + return 0; +} + +void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size) +{ + __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); +} + +void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) +{ + /* Expected to be called after all pKVM mappings have been released. 
*/ + WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root)); +} + +int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + void *mc, enum kvm_pgtable_walk_flags flags) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping = NULL; + struct kvm_hyp_memcache *cache = mc; + u64 gfn = addr >> PAGE_SHIFT; + u64 pfn = phys >> PAGE_SHIFT; + int ret; + + if (size != PAGE_SIZE && size != PMD_SIZE) + return -EINVAL; + + lockdep_assert_held_write(&kvm->mmu_lock); + + /* + * Calling stage2_map() on top of existing mappings is either happening because of a race + * with another vCPU, or because we're changing between page and block mappings. As per + * user_mem_abort(), same-size permission faults are handled in the relax_perms() path. + */ + mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1); + if (mapping) { + if (size == (mapping->nr_pages * PAGE_SIZE)) + return -EAGAIN; + + /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */ + ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); + if (ret) + return ret; + mapping = NULL; + } + + ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot); + if (WARN_ON(ret)) + return ret; + + swap(mapping, cache->mapping); + mapping->gfn = gfn; + mapping->pfn = pfn; + mapping->nr_pages = size / PAGE_SIZE; + pkvm_mapping_insert(mapping, &pgt->pkvm_mappings); + + return ret; +} + +int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock); + + return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); +} + +int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret = 0; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn, + mapping->nr_pages); + if (WARN_ON(ret)) + break; + } + + return ret; +} + +int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), + PAGE_SIZE * mapping->nr_pages); + + return 0; +} + +bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + bool young = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn, + mapping->nr_pages, mkold); + + return young; +} + +int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, + enum kvm_pgtable_walk_flags flags) +{ + return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot); +} + +void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) +{ + WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT)); +} + +void pkvm_pgtable_stage2_free_unlinked(struct 
kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) +{ + WARN_ON_ONCE(1); +} + +kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, + enum kvm_pgtable_prot prot, void *mc, bool force_pte) +{ + WARN_ON_ONCE(1); + return NULL; +} + +int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c new file mode 100644 index 000000000000..b03dbda7f1ab --- /dev/null +++ b/arch/arm64/kvm/pmu-emul.c @@ -0,0 +1,1335 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Linaro Ltd. + * Author: Shannon Zhao <shannon.zhao@linaro.org> + */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/list.h> +#include <linux/perf_event.h> +#include <linux/perf/arm_pmu.h> +#include <linux/uaccess.h> +#include <asm/kvm_emulate.h> +#include <kvm/arm_pmu.h> +#include <kvm/arm_vgic.h> + +#define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0) + +static LIST_HEAD(arm_pmus); +static DEFINE_MUTEX(arm_pmus_lock); + +static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); +static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); +static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc); + +bool kvm_supports_guest_pmuv3(void) +{ + guard(mutex)(&arm_pmus_lock); + return !list_empty(&arm_pmus); +} + +static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) +{ + return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]); +} + +static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx) +{ + return &vcpu->arch.pmu.pmc[cnt_idx]; +} + +static u32 __kvm_pmu_event_mask(unsigned int pmuver) +{ + switch (pmuver) { + case ID_AA64DFR0_EL1_PMUVer_IMP: + return GENMASK(9, 0); + case ID_AA64DFR0_EL1_PMUVer_V3P1: + case ID_AA64DFR0_EL1_PMUVer_V3P4: + case ID_AA64DFR0_EL1_PMUVer_V3P5: + case ID_AA64DFR0_EL1_PMUVer_V3P7: + return GENMASK(15, 0); + default: /* Shouldn't be here, just for sanity */ + WARN_ONCE(1, "Unknown PMU version %d\n", pmuver); + return 0; + } +} + +static u32 kvm_pmu_event_mask(struct kvm *kvm) +{ + u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); + u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0); + + return __kvm_pmu_event_mask(pmuver); +} + +u64 kvm_pmu_evtyper_mask(struct kvm *kvm) +{ + u64 mask = ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0 | + kvm_pmu_event_mask(kvm); + + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP)) + mask |= ARMV8_PMU_INCLUDE_EL2; + + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) + mask |= ARMV8_PMU_EXCLUDE_NS_EL0 | + ARMV8_PMU_EXCLUDE_NS_EL1 | + ARMV8_PMU_EXCLUDE_EL3; + + return mask; +} + +/** + * kvm_pmc_is_64bit - determine if counter is 64bit + * @pmc: counter context + */ +static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + + return (pmc->idx == ARMV8_PMU_CYCLE_IDX || + kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)); +} + +static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 val = kvm_vcpu_read_pmcr(vcpu); + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP; + + return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || + (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); +} + +static bool kvm_pmu_counter_can_chain(struct kvm_pmc *pmc) +{ + return (!(pmc->idx & 1) && (pmc->idx + 
1) < ARMV8_PMU_CYCLE_IDX && + !kvm_pmc_has_64bit_overflow(pmc)); +} + +static u32 counter_index_to_reg(u64 idx) +{ + return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + idx; +} + +static u32 counter_index_to_evtreg(u64 idx) +{ + return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; +} + +static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc) +{ + return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx)); +} + +static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 counter, reg, enabled, running; + + reg = counter_index_to_reg(pmc->idx); + counter = __vcpu_sys_reg(vcpu, reg); + + /* + * The real counter value is equal to the value of counter register plus + * the value perf event counts. + */ + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, &enabled, + &running); + + if (!kvm_pmc_is_64bit(pmc)) + counter = lower_32_bits(counter); + + return counter; +} + +/** + * kvm_pmu_get_counter_value - get PMU counter value + * @vcpu: The vcpu pointer + * @select_idx: The counter index + */ +u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) +{ + return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); +} + +static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 reg; + + kvm_pmu_release_perf_event(pmc); + + reg = counter_index_to_reg(pmc->idx); + + if (vcpu_mode_is_32bit(vcpu) && pmc->idx != ARMV8_PMU_CYCLE_IDX && + !force) { + /* + * Even with PMUv3p5, AArch32 cannot write to the top + * 32bit of the counters. The only possible course of + * action is to use PMCR.P, which will reset them to + * 0 (the only use of the 'force' parameter). + */ + val = __vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32); + val |= lower_32_bits(val); + } + + __vcpu_assign_sys_reg(vcpu, reg, val); + + /* Recreate the perf event to reflect the updated sample_period */ + kvm_pmu_create_perf_event(pmc); +} + +/** + * kvm_pmu_set_counter_value - set PMU counter value + * @vcpu: The vcpu pointer + * @select_idx: The counter index + * @val: The counter value + */ +void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) +{ + kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false); +} + +/** + * kvm_pmu_set_counter_value_user - set PMU counter value from user + * @vcpu: The vcpu pointer + * @select_idx: The counter index + * @val: The counter value + */ +void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) +{ + kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); + __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(select_idx), val); + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); +} + +/** + * kvm_pmu_release_perf_event - remove the perf event + * @pmc: The PMU counter pointer + */ +static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + perf_event_disable(pmc->perf_event); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +/** + * kvm_pmu_stop_counter - stop PMU counter + * @pmc: The PMU counter pointer + * + * If this counter has been configured to monitor some event, release it here. 
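+ * The counts accumulated by the perf event are folded back into the
+ * vcpu's counter register before the event is released, so nothing is
+ * lost.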
+ */ +static void kvm_pmu_stop_counter(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 reg, val; + + if (!pmc->perf_event) + return; + + val = kvm_pmu_get_pmc_value(pmc); + + reg = counter_index_to_reg(pmc->idx); + + __vcpu_assign_sys_reg(vcpu, reg, val); + + kvm_pmu_release_perf_event(pmc); +} + +/** + * kvm_pmu_vcpu_init - assign pmu counter idx for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) + pmu->pmc[i].idx = i; +} + +/** + * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) + kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); + irq_work_sync(&vcpu->arch.pmu.overflow_work); +} + +static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu) +{ + unsigned int hpmn, n; + + if (!vcpu_has_nv(vcpu)) + return 0; + + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + n = vcpu->kvm->arch.nr_pmu_counters; + + /* + * Programming HPMN to a value greater than PMCR_EL0.N is + * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an + * UNKNOWN number of counters (in our case, zero) are reserved for EL2. + */ + if (hpmn >= n) + return 0; + + /* + * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't + * implemented. Since KVM's ability to emulate HPMN=0 does not directly + * depend on hardware (all PMU registers are trapped), make the + * implementation choice that all counters are included in the second + * range reserved for EL2/EL3. + */ + return GENMASK(n - 1, hpmn); +} + +bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) +{ + return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx); +} + +u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + + if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) + return mask; + + return mask & ~kvm_pmu_hyp_counter_mask(vcpu); +} + +u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); + + if (val == 0) + return BIT(ARMV8_PMU_CYCLE_IDX); + else + return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); +} + +static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc) +{ + if (!pmc->perf_event) { + kvm_pmu_create_perf_event(pmc); + return; + } + + perf_event_enable(pmc->perf_event); + if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) + kvm_debug("fail to enable perf event\n"); +} + +static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) + perf_event_disable(pmc->perf_event); +} + +void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) +{ + int i; + + if (!val) + return; + + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + + if (!(val & BIT(i))) + continue; + + if (kvm_pmu_counter_is_enabled(pmc)) + kvm_pmc_enable_perf_event(pmc); + else + kvm_pmc_disable_perf_event(pmc); + } + + kvm_vcpu_pmu_restore_guest(vcpu); +} + +/* + * Returns the PMU overflow state, which is true if there exists an event + * counter where the values of the global enable control, PMOVSSET_EL0[n], and + * PMINTENSET_EL1[n] are all 1. 
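+ * The global enable control is PMCR_EL0.E for counters in the guest's
+ * EL0/EL1 range and MDCR_EL2.HPME for counters reserved for EL2.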
+ */ +static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) +{ + u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); + + reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); + + /* + * PMCR_EL0.E is the global enable control for event counters available + * to EL0 and EL1. + */ + if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) + reg &= kvm_pmu_hyp_counter_mask(vcpu); + + /* + * Otherwise, MDCR_EL2.HPME is the global enable control for event + * counters reserved for EL2. + */ + if (!(vcpu_read_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HPME)) + reg &= ~kvm_pmu_hyp_counter_mask(vcpu); + + return reg; +} + +static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool overflow; + + overflow = kvm_pmu_overflow_status(vcpu); + if (pmu->irq_level == overflow) + return; + + pmu->irq_level = overflow; + + if (likely(irqchip_in_kernel(vcpu->kvm))) { + int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, + pmu->irq_num, overflow, pmu); + WARN_ON(ret); + } +} + +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + return pmu->irq_level != run_level; +} + +/* + * Reflect the PMU overflow interrupt output level into the kvm_run structure + */ +void kvm_pmu_update_run(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the timer bitmap for user space */ + regs->device_irq_level &= ~KVM_ARM_DEV_PMU; + if (vcpu->arch.pmu.irq_level) + regs->device_irq_level |= KVM_ARM_DEV_PMU; +} + +/** + * kvm_pmu_flush_hwstate - flush pmu state to cpu + * @vcpu: The vcpu pointer + * + * Check if the PMU has overflowed while we were running in the host, and inject + * an interrupt if that was the case. + */ +void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) +{ + kvm_pmu_update_state(vcpu); +} + +/** + * kvm_pmu_sync_hwstate - sync pmu state from cpu + * @vcpu: The vcpu pointer + * + * Check if the PMU has overflowed while we were running in the guest, and + * inject an interrupt if that was the case. + */ +void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) +{ + kvm_pmu_update_state(vcpu); +} + +/* + * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding + * to the event. + * This is why we need a callback to do it once outside of the NMI context. + */ +static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(work, struct kvm_vcpu, arch.pmu.overflow_work); + kvm_vcpu_kick(vcpu); +} + +/* + * Perform an increment on any of the counters described in @mask, + * generating the overflow if required, and propagate it as a chained + * event if possible. 
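+ * (i.e. when an even-numbered counter wraps at 32 bits, the odd
+ * counter above it is incremented, provided it is enabled and
+ * programmed with the CHAIN event).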
+ */ +static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, + unsigned long mask, u32 event) +{ + int i; + + if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) + return; + + /* Weed out disabled counters */ + mask &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + + for_each_set_bit(i, &mask, ARMV8_PMU_CYCLE_IDX) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + u64 type, reg; + + /* Filter on event type */ + type = __vcpu_sys_reg(vcpu, counter_index_to_evtreg(i)); + type &= kvm_pmu_event_mask(vcpu->kvm); + if (type != event) + continue; + + /* Increment this counter */ + reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1; + if (!kvm_pmc_is_64bit(pmc)) + reg = lower_32_bits(reg); + __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(i), reg); + + /* No overflow? move on */ + if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg)) + continue; + + /* Mark overflow */ + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(i)); + + if (kvm_pmu_counter_can_chain(pmc)) + kvm_pmu_counter_increment(vcpu, BIT(i + 1), + ARMV8_PMUV3_PERFCTR_CHAIN); + } +} + +/* Compute the sample period for a given counter value */ +static u64 compute_period(struct kvm_pmc *pmc, u64 counter) +{ + u64 val; + + if (kvm_pmc_is_64bit(pmc) && kvm_pmc_has_64bit_overflow(pmc)) + val = (-counter) & GENMASK(63, 0); + else + val = (-counter) & GENMASK(31, 0); + + return val; +} + +/* + * When the perf event overflows, set the overflow status and inform the vcpu. + */ +static void kvm_pmu_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu); + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + int idx = pmc->idx; + u64 period; + + cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE); + + /* + * Reset the sample period to the architectural limit, + * i.e. the point where the counter overflows. + */ + period = compute_period(pmc, local64_read(&perf_event->count)); + + local64_set(&perf_event->hw.period_left, 0); + perf_event->attr.sample_period = period; + perf_event->hw.sample_period = period; + + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(idx)); + + if (kvm_pmu_counter_can_chain(pmc)) + kvm_pmu_counter_increment(vcpu, BIT(idx + 1), + ARMV8_PMUV3_PERFCTR_CHAIN); + + if (kvm_pmu_overflow_status(vcpu)) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + + if (!in_nmi()) + kvm_vcpu_kick(vcpu); + else + irq_work_queue(&vcpu->arch.pmu.overflow_work); + } + + cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD); +} + +/** + * kvm_pmu_software_increment - do software increment + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMSWINC register + */ +void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) +{ + kvm_pmu_counter_increment(vcpu, val, ARMV8_PMUV3_PERFCTR_SW_INCR); +} + +/** + * kvm_pmu_handle_pmcr - handle PMCR register + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMCR register + */ +void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) +{ + int i; + + /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */ + if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) + val &= ~ARMV8_PMU_PMCR_LP; + + /* Request a reload of the PMU to enable/disable affected counters */ + if ((__vcpu_sys_reg(vcpu, PMCR_EL0) ^ val) & ARMV8_PMU_PMCR_E) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + /* The reset bits don't indicate any state, and shouldn't be saved. 
*/ + __vcpu_assign_sys_reg(vcpu, PMCR_EL0, (val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P))); + + if (val & ARMV8_PMU_PMCR_C) + kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); + + if (val & ARMV8_PMU_PMCR_P) { + unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) & + ~BIT(ARMV8_PMU_CYCLE_IDX); + + if (!vcpu_is_el2(vcpu)) + mask &= ~kvm_pmu_hyp_counter_mask(vcpu); + + for_each_set_bit(i, &mask, 32) + kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); + } +} + +static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx))) + return false; + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return mdcr & MDCR_EL2_HPME; + + return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E; +} + +static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0; + bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0; + + return u == nsu; +} + +static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1; + bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1; + + return p == nsk; +} + +static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD)) + return false; + + return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2; +} + +static int kvm_map_pmu_event(struct kvm *kvm, unsigned int eventsel) +{ + struct arm_pmu *pmu = kvm->arch.arm_pmu; + + /* + * The CPU PMU likely isn't PMUv3; let the driver provide a mapping + * for the guest's PMUv3 event ID. + */ + if (unlikely(pmu->map_pmuv3_event)) + return pmu->map_pmuv3_event(eventsel); + + return eventsel; +} + +/** + * kvm_pmu_create_perf_event - create a perf event for a counter + * @pmc: Counter context + */ +static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; + struct perf_event *event; + struct perf_event_attr attr; + int eventsel; + u64 evtreg; + + evtreg = kvm_pmc_read_evtreg(pmc); + + kvm_pmu_stop_counter(pmc); + if (pmc->idx == ARMV8_PMU_CYCLE_IDX) + eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; + else + eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm); + + /* + * Neither SW increment nor chained events need to be backed + * by a perf event. + */ + if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR || + eventsel == ARMV8_PMUV3_PERFCTR_CHAIN) + return; + + /* + * If we have a filter in place and that the event isn't allowed, do + * not install a perf event either. + */ + if (vcpu->kvm->arch.pmu_filter && + !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) + return; + + /* + * Don't create an event if we're running on hardware that requires + * PMUv3 event translation and we couldn't find a valid mapping. 
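+	 * (kvm_map_pmu_event() signals this by returning a negative error
+	 * code).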
+ */ + eventsel = kvm_map_pmu_event(vcpu->kvm, eventsel); + if (eventsel < 0) + return; + + memset(&attr, 0, sizeof(struct perf_event_attr)); + attr.type = arm_pmu->pmu.type; + attr.size = sizeof(attr); + attr.pinned = 1; + attr.disabled = !kvm_pmu_counter_is_enabled(pmc); + attr.exclude_user = !kvm_pmc_counts_at_el0(pmc); + attr.exclude_hv = 1; /* Don't count EL2 events */ + attr.exclude_host = 1; /* Don't count host events */ + attr.config = eventsel; + + /* + * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the + * guest's EL2 filter. + */ + if (unlikely(is_hyp_ctxt(vcpu))) + attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc); + else + attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); + + /* + * If counting with a 64bit counter, advertise it to the perf + * code, carefully dealing with the initial sample period + * which also depends on the overflow. + */ + if (kvm_pmc_is_64bit(pmc)) + attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT; + + attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc)); + + event = perf_event_create_kernel_counter(&attr, -1, current, + kvm_pmu_perf_overflow, pmc); + + if (IS_ERR(event)) { + pr_err_once("kvm: pmu event creation failed %ld\n", + PTR_ERR(event)); + return; + } + + pmc->perf_event = event; +} + +/** + * kvm_pmu_set_counter_event_type - set selected counter to monitor some event + * @vcpu: The vcpu pointer + * @data: The data guest writes to PMXEVTYPER_EL0 + * @select_idx: The number of selected counter + * + * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an + * event with given hardware event number. Here we call perf_event API to + * emulate this action and create a kernel perf event for it. + */ +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, + u64 select_idx) +{ + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx); + u64 reg; + + reg = counter_index_to_evtreg(pmc->idx); + __vcpu_assign_sys_reg(vcpu, reg, (data & kvm_pmu_evtyper_mask(vcpu->kvm))); + + kvm_pmu_create_perf_event(pmc); +} + +void kvm_host_pmu_init(struct arm_pmu *pmu) +{ + struct arm_pmu_entry *entry; + + /* + * Check the sanitised PMU version for the system, as KVM does not + * support implementations where PMUv3 exists on a subset of CPUs. + */ + if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit())) + return; + + guard(mutex)(&arm_pmus_lock); + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + entry->arm_pmu = pmu; + list_add_tail(&entry->entry, &arm_pmus); +} + +static struct arm_pmu *kvm_pmu_probe_armpmu(void) +{ + struct arm_pmu_entry *entry; + struct arm_pmu *pmu; + int cpu; + + guard(mutex)(&arm_pmus_lock); + + /* + * It is safe to use a stale cpu to iterate the list of PMUs so long as + * the same value is used for the entirety of the loop. Given this, and + * the fact that no percpu data is used for the lookup there is no need + * to disable preemption. + * + * It is still necessary to get a valid cpu, though, to probe for the + * default PMU instance as userspace is not required to specify a PMU + * type. In order to uphold the preexisting behavior KVM selects the + * PMU instance for the core during vcpu init. A dependent use + * case would be a user with disdain of all things big.LITTLE that + * affines the VMM to a particular cluster of cores. + * + * In any case, userspace should just do the sane thing and use the UAPI + * to select a PMU type directly. But, be wary of the baggage being + * carried here. 
+ */ + cpu = raw_smp_processor_id(); + list_for_each_entry(entry, &arm_pmus, entry) { + pmu = entry->arm_pmu; + + if (cpumask_test_cpu(cpu, &pmu->supported_cpus)) + return pmu; + } + + return NULL; +} + +static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1) +{ + u32 hi[2], lo[2]; + + bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); + bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); + + return ((u64)hi[pmceid1] << 32) | lo[pmceid1]; +} + +static u64 compute_pmceid0(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 0); + + /* always support SW_INCR */ + val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR); + /* always support CHAIN */ + val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); + return val; +} + +static u64 compute_pmceid1(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 1); + + /* + * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled + * as RAZ + */ + val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); + return val; +} + +u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) +{ + struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu; + unsigned long *bmap = vcpu->kvm->arch.pmu_filter; + u64 val, mask = 0; + int base, i, nr_events; + + if (!pmceid1) { + val = compute_pmceid0(cpu_pmu); + base = 0; + } else { + val = compute_pmceid1(cpu_pmu); + base = 32; + } + + if (!bmap) + return val; + + nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1; + + for (i = 0; i < 32; i += 8) { + u64 byte; + + byte = bitmap_get_value8(bmap, base + i); + mask |= byte << i; + if (nr_events >= (0x4000 + base + 32)) { + byte = bitmap_get_value8(bmap, 0x4000 + base + i); + mask |= byte << (32 + i); + } + } + + return val & mask; +} + +void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) +{ + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, mask); + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, mask); + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, mask); + + kvm_pmu_reprogram_counter_mask(vcpu, mask); +} + +int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.pmu.created) + return -EINVAL; + + /* + * A valid interrupt configuration for the PMU is either to have a + * properly configured interrupt number and using an in-kernel + * irqchip, or to not have an in-kernel GIC and not set an IRQ. + */ + if (irqchip_in_kernel(vcpu->kvm)) { + int irq = vcpu->arch.pmu.irq_num; + /* + * If we are using an in-kernel vgic, at this point we know + * the vgic will be initialized, so we can check the PMU irq + * number against the dimensions of the vgic and make sure + * it's valid. + */ + if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) + return -EINVAL; + } else if (kvm_arm_pmu_irq_initialized(vcpu)) { + return -EINVAL; + } + + return 0; +} + +static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) { + int ret; + + /* + * If using the PMU with an in-kernel virtual GIC + * implementation, we require the GIC to be already + * initialized when initializing the PMU. 
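+		 * This allows the overflow interrupt to be registered with
+		 * the vgic via kvm_vgic_set_owner() below.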
+ */ + if (!vgic_initialized(vcpu->kvm)) + return -ENODEV; + + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -ENXIO; + + ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, + &vcpu->arch.pmu); + if (ret) + return ret; + } + + init_irq_work(&vcpu->arch.pmu.overflow_work, + kvm_pmu_perf_overflow_notify_vcpu); + + vcpu->arch.pmu.created = true; + return 0; +} + +/* + * For one VM the interrupt type must be same for each vcpu. + * As a PPI, the interrupt number is the same for all vcpus, + * while as an SPI it must be a separate number per vcpu. + */ +static bool pmu_irq_is_valid(struct kvm *kvm, int irq) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_arm_pmu_irq_initialized(vcpu)) + continue; + + if (irq_is_ppi(irq)) { + if (vcpu->arch.pmu.irq_num != irq) + return false; + } else { + if (vcpu->arch.pmu.irq_num == irq) + return false; + } + } + + return true; +} + +/** + * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters. + * @kvm: The kvm pointer + */ +u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; + + /* + * PMUv3 requires that all event counters are capable of counting any + * event, though the same may not be true of non-PMUv3 hardware. + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return 1; + + /* + * The arm_pmu->cntr_mask considers the fixed counter(s) as well. + * Ignore those and return only the general-purpose counters. + */ + return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); +} + +static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr) +{ + kvm->arch.nr_pmu_counters = nr; + + /* Reset MDCR_EL2.HPMN behind the vcpus' back... */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2, kvm->arch.vcpu_features)) { + struct kvm_vcpu *vcpu; + unsigned long i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2); + val &= ~MDCR_EL2_HPMN; + val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters); + __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); + } + } +} + +static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) +{ + lockdep_assert_held(&kvm->arch.config_lock); + + kvm->arch.arm_pmu = arm_pmu; + kvm_arm_set_nr_counters(kvm, kvm_arm_pmu_get_max_counters(kvm)); +} + +/** + * kvm_arm_set_default_pmu - No PMU set, get the default one. + * @kvm: The kvm pointer + * + * The observant among you will notice that the supported_cpus + * mask does not get updated for the default PMU even though it + * is quite possible the selected instance supports only a + * subset of cores in the system. This is intentional, and + * upholds the preexisting behavior on heterogeneous systems + * where vCPUs can be scheduled on any core but the guest + * counters could stop working. 
+ */ +int kvm_arm_set_default_pmu(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu(); + + if (!arm_pmu) + return -ENODEV; + + kvm_arm_set_pmu(kvm, arm_pmu); + return 0; +} + +static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) +{ + struct kvm *kvm = vcpu->kvm; + struct arm_pmu_entry *entry; + struct arm_pmu *arm_pmu; + int ret = -ENXIO; + + lockdep_assert_held(&kvm->arch.config_lock); + mutex_lock(&arm_pmus_lock); + + list_for_each_entry(entry, &arm_pmus, entry) { + arm_pmu = entry->arm_pmu; + if (arm_pmu->pmu.type == pmu_id) { + if (kvm_vm_has_ran_once(kvm) || + (kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) { + ret = -EBUSY; + break; + } + + kvm_arm_set_pmu(kvm, arm_pmu); + cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus); + ret = 0; + break; + } + } + + mutex_unlock(&arm_pmus_lock); + return ret; +} + +static int kvm_arm_pmu_v3_set_nr_counters(struct kvm_vcpu *vcpu, unsigned int n) +{ + struct kvm *kvm = vcpu->kvm; + + if (!kvm->arch.arm_pmu) + return -EINVAL; + + if (n > kvm_arm_pmu_get_max_counters(kvm)) + return -EINVAL; + + kvm_arm_set_nr_counters(kvm, n); + return 0; +} + +int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + struct kvm *kvm = vcpu->kvm; + + lockdep_assert_held(&kvm->arch.config_lock); + + if (!kvm_vcpu_has_pmu(vcpu)) + return -ENODEV; + + if (vcpu->arch.pmu.created) + return -EBUSY; + + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: { + int __user *uaddr = (int __user *)(long)attr->addr; + int irq; + + if (!irqchip_in_kernel(kvm)) + return -EINVAL; + + if (get_user(irq, uaddr)) + return -EFAULT; + + /* The PMU overflow interrupt can be a PPI or a valid SPI. */ + if (!(irq_is_ppi(irq) || irq_is_spi(irq))) + return -EINVAL; + + if (!pmu_irq_is_valid(kvm, irq)) + return -EINVAL; + + if (kvm_arm_pmu_irq_initialized(vcpu)) + return -EBUSY; + + kvm_debug("Set kvm ARM PMU irq: %d\n", irq); + vcpu->arch.pmu.irq_num = irq; + return 0; + } + case KVM_ARM_VCPU_PMU_V3_FILTER: { + u8 pmuver = kvm_arm_pmu_get_pmuver_limit(); + struct kvm_pmu_event_filter __user *uaddr; + struct kvm_pmu_event_filter filter; + int nr_events; + + /* + * Allow userspace to specify an event filter for the entire + * event range supported by PMUVer of the hardware, rather + * than the guest's PMUVer for KVM backward compatibility. + */ + nr_events = __kvm_pmu_event_mask(pmuver) + 1; + + uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; + + if (copy_from_user(&filter, uaddr, sizeof(filter))) + return -EFAULT; + + if (((u32)filter.base_event + filter.nevents) > nr_events || + (filter.action != KVM_PMU_EVENT_ALLOW && + filter.action != KVM_PMU_EVENT_DENY)) + return -EINVAL; + + if (kvm_vm_has_ran_once(kvm)) + return -EBUSY; + + if (!kvm->arch.pmu_filter) { + kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); + if (!kvm->arch.pmu_filter) + return -ENOMEM; + + /* + * The default depends on the first applied filter. + * If it allows events, the default is to deny. + * Conversely, if the first filter denies a set of + * events, the default is to allow. 
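+			 * For example, a first ALLOW filter covering a single
+			 * event leaves every other event denied until further
+			 * filters are installed.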
+ */ + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_zero(kvm->arch.pmu_filter, nr_events); + else + bitmap_fill(kvm->arch.pmu_filter, nr_events); + } + + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_set(kvm->arch.pmu_filter, filter.base_event, filter.nevents); + else + bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents); + + return 0; + } + case KVM_ARM_VCPU_PMU_V3_SET_PMU: { + int __user *uaddr = (int __user *)(long)attr->addr; + int pmu_id; + + if (get_user(pmu_id, uaddr)) + return -EFAULT; + + return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id); + } + case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: { + unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr; + unsigned int n; + + if (get_user(n, uaddr)) + return -EFAULT; + + return kvm_arm_pmu_v3_set_nr_counters(vcpu, n); + } + case KVM_ARM_VCPU_PMU_V3_INIT: + return kvm_arm_pmu_v3_init(vcpu); + } + + return -ENXIO; +} + +int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: { + int __user *uaddr = (int __user *)(long)attr->addr; + int irq; + + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; + + if (!kvm_vcpu_has_pmu(vcpu)) + return -ENODEV; + + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -ENXIO; + + irq = vcpu->arch.pmu.irq_num; + return put_user(irq, uaddr); + } + } + + return -ENXIO; +} + +int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: + case KVM_ARM_VCPU_PMU_V3_INIT: + case KVM_ARM_VCPU_PMU_V3_FILTER: + case KVM_ARM_VCPU_PMU_V3_SET_PMU: + case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: + if (kvm_vcpu_has_pmu(vcpu)) + return 0; + } + + return -ENXIO; +} + +u8 kvm_arm_pmu_get_pmuver_limit(void) +{ + unsigned int pmuver; + + pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, + read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); + + /* + * Spoof a barebones PMUv3 implementation if the system supports IMPDEF + * traps of the PMUv3 sysregs + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return ID_AA64DFR0_EL1_PMUVer_IMP; + + /* + * Otherwise, treat IMPLEMENTATION DEFINED functionality as + * unimplemented + */ + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return 0; + + return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5); +} + +/** + * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU + * @vcpu: The vcpu pointer + */ +u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) +{ + u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); + u64 n = vcpu->kvm->arch.nr_pmu_counters; + + if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu)) + n = FIELD_GET(MDCR_EL2_HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + + return u64_replace_bits(pmcr, n, ARMV8_PMU_PMCR_N); +} + +void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) +{ + bool reprogrammed = false; + unsigned long mask; + int i; + + mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + for_each_set_bit(i, &mask, 32) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + + /* + * We only need to reconfigure events where the filter is + * different at EL1 vs. EL2, as we're multiplexing the true EL1 + * event filter bit for nested. 
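+		 * Events that filter identically at EL1 and EL2 keep their
+		 * existing perf event.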
+ */ + if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc)) + continue; + + kvm_pmu_create_perf_event(pmc); + reprogrammed = true; + } + + if (reprogrammed) + kvm_vcpu_pmu_restore_guest(vcpu); +} diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c new file mode 100644 index 000000000000..6b48a3d16d0d --- /dev/null +++ b/arch/arm64/kvm/pmu.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Arm Limited + * Author: Andrew Murray <Andrew.Murray@arm.com> + */ +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include <linux/perf/arm_pmu.h> +#include <linux/perf/arm_pmuv3.h> + +static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); + +/* + * Given the perf event attributes and system type, determine + * if we are going to need to switch counters at guest entry/exit. + */ +static bool kvm_pmu_switch_needed(struct perf_event_attr *attr) +{ + /** + * With VHE the guest kernel runs at EL1 and the host at EL2, + * where user (EL0) is excluded then we have no reason to switch + * counters. + */ + if (has_vhe() && attr->exclude_user) + return false; + + /* Only switch if attributes are different */ + return (attr->exclude_host != attr->exclude_guest); +} + +struct kvm_pmu_events *kvm_get_pmu_events(void) +{ + return this_cpu_ptr(&kvm_pmu_events); +} + +/* + * Add events to track that we may want to switch at guest entry/exit + * time. + */ +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) +{ + struct kvm_pmu_events *pmu = kvm_get_pmu_events(); + + if (!system_supports_pmuv3() || !kvm_pmu_switch_needed(attr)) + return; + + if (!attr->exclude_host) + pmu->events_host |= set; + if (!attr->exclude_guest) + pmu->events_guest |= set; +} + +/* + * Stop tracking events + */ +void kvm_clr_pmu_events(u64 clr) +{ + struct kvm_pmu_events *pmu = kvm_get_pmu_events(); + + if (!system_supports_pmuv3()) + return; + + pmu->events_host &= ~clr; + pmu->events_guest &= ~clr; +} + +/* + * Read a value direct from PMEVTYPER<idx> where idx is 0-30 + * or PMxCFILTR_EL0 where idx is 31-32. + */ +static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) +{ + if (idx == ARMV8_PMU_CYCLE_IDX) + return read_pmccfiltr(); + else if (idx == ARMV8_PMU_INSTR_IDX) + return read_pmicfiltr(); + + return read_pmevtypern(idx); +} + +/* + * Write a value direct to PMEVTYPER<idx> where idx is 0-30 + * or PMxCFILTR_EL0 where idx is 31-32. + */ +static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val) +{ + if (idx == ARMV8_PMU_CYCLE_IDX) + write_pmccfiltr(val); + else if (idx == ARMV8_PMU_INSTR_IDX) + write_pmicfiltr(val); + else + write_pmevtypern(idx, val); +} + +/* + * Modify ARMv8 PMU events to include EL0 counting + */ +static void kvm_vcpu_pmu_enable_el0(unsigned long events) +{ + u64 typer; + u32 counter; + + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { + typer = kvm_vcpu_pmu_read_evtype_direct(counter); + typer &= ~ARMV8_PMU_EXCLUDE_EL0; + kvm_vcpu_pmu_write_evtype_direct(counter, typer); + } +} + +/* + * Modify ARMv8 PMU events to exclude EL0 counting + */ +static void kvm_vcpu_pmu_disable_el0(unsigned long events) +{ + u64 typer; + u32 counter; + + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { + typer = kvm_vcpu_pmu_read_evtype_direct(counter); + typer |= ARMV8_PMU_EXCLUDE_EL0; + kvm_vcpu_pmu_write_evtype_direct(counter, typer); + } +} + +/* + * On VHE ensure that only guest events have EL0 counting enabled. + * This is called from both vcpu_{load,put} and the sysreg handling. 
+ * Since the latter is preemptible, special care must be taken to + * disable preemption. + */ +void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu_events *pmu; + u64 events_guest, events_host; + + if (!system_supports_pmuv3() || !has_vhe()) + return; + + preempt_disable(); + pmu = kvm_get_pmu_events(); + events_guest = pmu->events_guest; + events_host = pmu->events_host; + + kvm_vcpu_pmu_enable_el0(events_guest); + kvm_vcpu_pmu_disable_el0(events_host); + preempt_enable(); +} + +/* + * On VHE ensure that only host events have EL0 counting enabled + */ +void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu_events *pmu; + u64 events_guest, events_host; + + if (!system_supports_pmuv3() || !has_vhe()) + return; + + pmu = kvm_get_pmu_events(); + events_guest = pmu->events_guest; + events_host = pmu->events_host; + + kvm_vcpu_pmu_enable_el0(events_host); + kvm_vcpu_pmu_disable_el0(events_guest); +} + +/* + * With VHE, keep track of the PMUSERENR_EL0 value for the host EL0 on the pCPU + * where PMUSERENR_EL0 for the guest is loaded, since PMUSERENR_EL0 is switched + * to the value for the guest on vcpu_load(). The value for the host EL0 + * will be restored on vcpu_put(), before returning to userspace. + * This isn't necessary for nVHE, as the register is context switched for + * every guest enter/exit. + * + * Return true if KVM takes care of the register. Otherwise return false. + */ +bool kvm_set_pmuserenr(u64 val) +{ + struct kvm_cpu_context *hctxt; + struct kvm_vcpu *vcpu; + + if (!system_supports_pmuv3() || !has_vhe()) + return false; + + vcpu = kvm_get_running_vcpu(); + if (!vcpu || !vcpu_get_flag(vcpu, PMUSERENR_ON_CPU)) + return false; + + hctxt = host_data_ptr(host_ctxt); + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val; + return true; +} + +/* + * If we interrupted the guest to update the host PMU context, make + * sure we re-apply the guest EL0 state. + */ +void kvm_vcpu_pmu_resync_el0(void) +{ + struct kvm_vcpu *vcpu; + + if (!has_vhe() || !in_interrupt()) + return; + + vcpu = kvm_get_running_vcpu(); + if (!vcpu) + return; + + kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu); +} diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c new file mode 100644 index 000000000000..3b5dbe9a0a0e --- /dev/null +++ b/arch/arm64/kvm/psci.c @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/arm-smccc.h> +#include <linux/preempt.h> +#include <linux/kvm_host.h> +#include <linux/uaccess.h> +#include <linux/wait.h> + +#include <asm/cputype.h> +#include <asm/kvm_emulate.h> + +#include <kvm/arm_psci.h> +#include <kvm/arm_hypercalls.h> + +/* + * This is an implementation of the Power State Coordination Interface + * as described in ARM document number ARM DEN 0022A. + */ + +#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) + +static unsigned long psci_affinity_mask(unsigned long affinity_level) +{ + if (affinity_level <= 3) + return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); + + return 0; +} + +static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + /* + * NOTE: For simplicity, we make VCPU suspend emulation to be + * same-as WFI (Wait-for-interrupt) emulation. + * + * This means for KVM the wakeup events are interrupts and + * this is consistent with intended use of StateID as described + * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A). 
+ * + * Further, we also treat power-down request to be same as + * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2 + * specification (ARM DEN 0022A). This means all suspend states + * for KVM will preserve the register state. + */ + kvm_vcpu_wfi(vcpu); + + return PSCI_RET_SUCCESS; +} + +static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, + unsigned long affinity) +{ + return !(affinity & ~MPIDR_HWID_BITMASK); +} + +static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) +{ + struct vcpu_reset_state *reset_state; + struct kvm *kvm = source_vcpu->kvm; + struct kvm_vcpu *vcpu = NULL; + int ret = PSCI_RET_SUCCESS; + unsigned long cpu_id; + + cpu_id = smccc_get_arg1(source_vcpu); + if (!kvm_psci_valid_affinity(source_vcpu, cpu_id)) + return PSCI_RET_INVALID_PARAMS; + + vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); + + /* + * Make sure the caller requested a valid CPU and that the CPU is + * turned off. + */ + if (!vcpu) + return PSCI_RET_INVALID_PARAMS; + + spin_lock(&vcpu->arch.mp_state_lock); + if (!kvm_arm_vcpu_stopped(vcpu)) { + if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) + ret = PSCI_RET_ALREADY_ON; + else + ret = PSCI_RET_INVALID_PARAMS; + + goto out_unlock; + } + + reset_state = &vcpu->arch.reset_state; + + reset_state->pc = smccc_get_arg2(source_vcpu); + + /* Propagate caller endianness */ + reset_state->be = kvm_vcpu_is_be(source_vcpu); + + /* + * NOTE: We always update r0 (or x0) because for PSCI v0.1 + * the general purpose registers are undefined upon CPU_ON. + */ + reset_state->r0 = smccc_get_arg3(source_vcpu); + + reset_state->reset = true; + kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); + + /* + * Make sure the reset request is observed if the RUNNABLE mp_state is + * observed. + */ + smp_wmb(); + + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); + kvm_vcpu_wake_up(vcpu); + +out_unlock: + spin_unlock(&vcpu->arch.mp_state_lock); + return ret; +} + +static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) +{ + int matching_cpus = 0; + unsigned long i, mpidr; + unsigned long target_affinity; + unsigned long target_affinity_mask; + unsigned long lowest_affinity_level; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tmp; + + target_affinity = smccc_get_arg1(vcpu); + lowest_affinity_level = smccc_get_arg2(vcpu); + + if (!kvm_psci_valid_affinity(vcpu, target_affinity)) + return PSCI_RET_INVALID_PARAMS; + + /* Determine target affinity mask */ + target_affinity_mask = psci_affinity_mask(lowest_affinity_level); + if (!target_affinity_mask) + return PSCI_RET_INVALID_PARAMS; + + /* Ignore other bits of target affinity */ + target_affinity &= target_affinity_mask; + + /* + * If one or more VCPU matching target affinity are running + * then ON else OFF + */ + kvm_for_each_vcpu(i, tmp, kvm) { + mpidr = kvm_vcpu_get_mpidr_aff(tmp); + if ((mpidr & target_affinity_mask) == target_affinity) { + matching_cpus++; + if (!kvm_arm_vcpu_stopped(tmp)) + return PSCI_0_2_AFFINITY_LEVEL_ON; + } + } + + if (!matching_cpus) + return PSCI_RET_INVALID_PARAMS; + + return PSCI_0_2_AFFINITY_LEVEL_OFF; +} + +static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags) +{ + unsigned long i; + struct kvm_vcpu *tmp; + + /* + * The KVM ABI specifies that a system event exit may call KVM_RUN + * again and may perform shutdown/reboot at a later time that when the + * actual request is made. 
Since we are implementing PSCI and a + * caller of PSCI reboot and shutdown expects that the system shuts + * down or reboots immediately, let's make sure that VCPUs are not run + * after this call is handled and before the VCPUs have been + * re-initialized. + */ + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { + spin_lock(&tmp->arch.mp_state_lock); + WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); + spin_unlock(&tmp->arch.mp_state_lock); + } + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); + + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); + vcpu->run->system_event.type = type; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = flags; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; +} + +static void kvm_psci_system_off(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, 0); +} + +static void kvm_psci_system_off2(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, + KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2); +} + +static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, 0); +} + +static void kvm_psci_system_reset2(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, + KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2); +} + +static void kvm_psci_system_suspend(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + memset(&run->system_event, 0, sizeof(vcpu->run->system_event)); + run->system_event.type = KVM_SYSTEM_EVENT_SUSPEND; + run->exit_reason = KVM_EXIT_SYSTEM_EVENT; +} + +static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) +{ + int i; + + /* + * Zero the input registers' upper 32 bits. They will be fully + * zeroed on exit, so we're fine changing them in place. + */ + for (i = 1; i < 4; i++) + vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); +} + +static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn) +{ + /* + * Prevent 32 bit guests from calling 64 bit PSCI functions. + */ + if ((fn & PSCI_0_2_64BIT) && vcpu_mode_is_32bit(vcpu)) + return PSCI_RET_NOT_SUPPORTED; + + return 0; +} + +static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) +{ + u32 psci_fn = smccc_get_function(vcpu); + unsigned long val; + int ret = 1; + + switch (psci_fn) { + case PSCI_0_2_FN_PSCI_VERSION: + /* + * Bits[31:16] = Major Version = 0 + * Bits[15:0] = Minor Version = 2 + */ + val = KVM_ARM_PSCI_0_2; + break; + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + val = kvm_psci_vcpu_suspend(vcpu); + break; + case PSCI_0_2_FN_CPU_OFF: + kvm_arm_vcpu_power_off(vcpu); + val = PSCI_RET_SUCCESS; + break; + case PSCI_0_2_FN_CPU_ON: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_CPU_ON: + val = kvm_psci_vcpu_on(vcpu); + break; + case PSCI_0_2_FN_AFFINITY_INFO: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_AFFINITY_INFO: + val = kvm_psci_vcpu_affinity_info(vcpu); + break; + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: + /* + * Trusted OS is MP hence does not require migration + * or + * Trusted OS is not present + */ + val = PSCI_0_2_TOS_MP; + break; + case PSCI_0_2_FN_SYSTEM_OFF: + kvm_psci_system_off(vcpu); + /* + * We shouldn't be going back to guest VCPU after + * receiving SYSTEM_OFF request. + * + * If user space accidentally/deliberately resumes + * guest VCPU after SYSTEM_OFF request then guest + * VCPU should see internal failure from PSCI return + * value. 
To achieve this, we preload r0 (or x0) with + * PSCI return value INTERNAL_FAILURE. + */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; + case PSCI_0_2_FN_SYSTEM_RESET: + kvm_psci_system_reset(vcpu); + /* + * Same reason as SYSTEM_OFF for preloading r0 (or x0) + * with PSCI return value INTERNAL_FAILURE. + */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return ret; +} + +static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) +{ + unsigned long val = PSCI_RET_NOT_SUPPORTED; + u32 psci_fn = smccc_get_function(vcpu); + struct kvm *kvm = vcpu->kvm; + u32 arg; + int ret = 1; + + switch(psci_fn) { + case PSCI_0_2_FN_PSCI_VERSION: + val = PSCI_VERSION(1, minor); + break; + case PSCI_1_0_FN_PSCI_FEATURES: + arg = smccc_get_arg1(vcpu); + val = kvm_psci_check_allowed_function(vcpu, arg); + if (val) + break; + + val = PSCI_RET_NOT_SUPPORTED; + + switch(arg) { + case PSCI_0_2_FN_PSCI_VERSION: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + case PSCI_0_2_FN_AFFINITY_INFO: + case PSCI_0_2_FN64_AFFINITY_INFO: + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_1_0_FN_PSCI_FEATURES: + case ARM_SMCCC_VERSION_FUNC_ID: + val = 0; + break; + case PSCI_1_0_FN_SYSTEM_SUSPEND: + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) + val = 0; + break; + case PSCI_1_1_FN_SYSTEM_RESET2: + case PSCI_1_1_FN64_SYSTEM_RESET2: + if (minor >= 1) + val = 0; + break; + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor >= 3) + val = PSCI_1_3_OFF_TYPE_HIBERNATE_OFF; + break; + } + break; + case PSCI_1_0_FN_SYSTEM_SUSPEND: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + /* + * Return directly to userspace without changing the vCPU's + * registers. Userspace depends on reading the SMCCC parameters + * to implement SYSTEM_SUSPEND. + */ + if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) { + kvm_psci_system_suspend(vcpu); + return 0; + } + break; + case PSCI_1_1_FN_SYSTEM_RESET2: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_1_FN64_SYSTEM_RESET2: + if (minor >= 1) { + arg = smccc_get_arg1(vcpu); + + if (arg <= PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET || + arg >= PSCI_1_1_RESET_TYPE_VENDOR_START) { + kvm_psci_system_reset2(vcpu); + vcpu_set_reg(vcpu, 0, PSCI_RET_INTERNAL_FAILURE); + return 0; + } + + val = PSCI_RET_INVALID_PARAMS; + break; + } + break; + case PSCI_1_3_FN_SYSTEM_OFF2: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor < 3) + break; + + arg = smccc_get_arg1(vcpu); + /* + * SYSTEM_OFF2 defaults to HIBERNATE_OFF if arg1 is zero. arg2 + * must be zero. + */ + if ((arg && arg != PSCI_1_3_OFF_TYPE_HIBERNATE_OFF) || + smccc_get_arg2(vcpu) != 0) { + val = PSCI_RET_INVALID_PARAMS; + break; + } + kvm_psci_system_off2(vcpu); + /* + * We shouldn't be going back to the guest after receiving a + * SYSTEM_OFF2 request. Preload a return value of + * INTERNAL_FAILURE should userspace ignore the exit and resume + * the vCPU. 
+ */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; + default: + return kvm_psci_0_2_call(vcpu); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return ret; +} + +static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) +{ + u32 psci_fn = smccc_get_function(vcpu); + unsigned long val; + + switch (psci_fn) { + case KVM_PSCI_FN_CPU_OFF: + kvm_arm_vcpu_power_off(vcpu); + val = PSCI_RET_SUCCESS; + break; + case KVM_PSCI_FN_CPU_ON: + val = kvm_psci_vcpu_on(vcpu); + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; +} + +/** + * kvm_psci_call - handle PSCI call if r0 value is in range + * @vcpu: Pointer to the VCPU struct + * + * Handle PSCI calls from guests through traps from HVC instructions. + * The calling convention is similar to SMC calls to the secure world + * where the function number is placed in r0. + * + * This function returns: > 0 (success), 0 (success but exit to user + * space), and < 0 (errors) + * + * Errors: + * -EINVAL: Unrecognized PSCI function + */ +int kvm_psci_call(struct kvm_vcpu *vcpu) +{ + u32 psci_fn = smccc_get_function(vcpu); + int version = kvm_psci_version(vcpu); + unsigned long val; + + val = kvm_psci_check_allowed_function(vcpu, psci_fn); + if (val) { + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; + } + + switch (version) { + case KVM_ARM_PSCI_1_3: + return kvm_psci_1_x_call(vcpu, 3); + case KVM_ARM_PSCI_1_2: + return kvm_psci_1_x_call(vcpu, 2); + case KVM_ARM_PSCI_1_1: + return kvm_psci_1_x_call(vcpu, 1); + case KVM_ARM_PSCI_1_0: + return kvm_psci_1_x_call(vcpu, 0); + case KVM_ARM_PSCI_0_2: + return kvm_psci_0_2_call(vcpu); + case KVM_ARM_PSCI_0_1: + return kvm_psci_0_1_call(vcpu); + default: + WARN_ONCE(1, "Unknown PSCI version %d", version); + smccc_set_retval(vcpu, SMCCC_RET_NOT_SUPPORTED, 0, 0, 0); + return 1; + } +} diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c new file mode 100644 index 000000000000..6cbe018fd6fd --- /dev/null +++ b/arch/arm64/kvm/ptdump.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Debug helper used to dump the stage-2 pagetables of the system and their + * associated permissions. 
+ * + * Copyright (C) Google, 2024 + * Author: Sebastian Ene <sebastianene@google.com> + */ +#include <linux/debugfs.h> +#include <linux/kvm_host.h> +#include <linux/seq_file.h> + +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/ptdump.h> + +#define MARKERS_LEN 2 +#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1) + +struct kvm_ptdump_guest_state { + struct kvm *kvm; + struct ptdump_pg_state parser_state; + struct addr_marker ipa_marker[MARKERS_LEN]; + struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS]; + struct ptdump_range range[MARKERS_LEN]; +}; + +static const struct ptdump_prot_bits stage2_pte_bits[] = { + { + .mask = PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "F", + }, + { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, + .set = "R", + .clear = " ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, + .set = "W", + .clear = " ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b00UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px ux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b01UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b10UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNUXN", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b11UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px UXN", + }, + { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF, + .val = KVM_PTE_LEAF_ATTR_LO_S2_AF, + .set = "AF", + .clear = " ", + }, + { + .mask = PMD_TYPE_MASK, + .val = PMD_TYPE_SECT, + .set = "BLK", + .clear = " ", + }, +}; + +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct ptdump_pg_state *st = ctx->arg; + struct ptdump_state *pt_st = &st->ptdump; + + note_page(pt_st, ctx->addr, ctx->level, ctx->old); + + return 0; +} + +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl) +{ + u32 i; + u64 mask; + + if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + mask = 0; + for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++) + mask |= stage2_pte_bits[i].mask; + + for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) { + snprintf(level[i].name, sizeof(level[i].name), "%u", i); + + level[i].num = ARRAY_SIZE(stage2_pte_bits); + level[i].bits = stage2_pte_bits; + level[i].mask = mask; + } + + return 0; +} + +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm) +{ + struct kvm_ptdump_guest_state *st; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct kvm_pgtable *pgtable = mmu->pgt; + int ret; + + st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT); + if (!st) + return ERR_PTR(-ENOMEM); + + ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level); + if (ret) { + kfree(st); + return ERR_PTR(ret); + } + + st->ipa_marker[0].name = "Guest IPA"; + st->ipa_marker[1].start_address = BIT(pgtable->ia_bits); + st->range[0].end = BIT(pgtable->ia_bits); + + st->kvm = kvm; + st->parser_state = (struct ptdump_pg_state) { + .marker = &st->ipa_marker[0], + .level = -1, + .pg_level = &st->level[0], + .ptdump.range = &st->range[0], + .start_address = 0, + }; + + return st; +} + +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused) +{ + int ret; + struct kvm_ptdump_guest_state *st = m->private; + struct kvm *kvm = st->kvm; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct ptdump_pg_state *parser_state = 
&st->parser_state; + struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) { + .cb = kvm_ptdump_visitor, + .arg = parser_state, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + parser_state->seq = m; + + write_lock(&kvm->mmu_lock); + ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker); + write_unlock(&kvm->mmu_lock); + + return ret; +} + +static int kvm_ptdump_guest_open(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + struct kvm_ptdump_guest_state *st; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + st = kvm_ptdump_parser_create(kvm); + if (IS_ERR(st)) { + ret = PTR_ERR(st); + goto err_with_kvm_ref; + } + + ret = single_open(file, kvm_ptdump_guest_show, st); + if (!ret) + return 0; + + kfree(st); +err_with_kvm_ref: + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_ptdump_guest_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + void *st = ((struct seq_file *)file->private_data)->private; + + kfree(st); + kvm_put_kvm(kvm); + + return single_release(m, file); +} + +static const struct file_operations kvm_ptdump_guest_fops = { + .open = kvm_ptdump_guest_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_ptdump_guest_close, +}; + +static int kvm_pgtable_range_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%2u\n", pgtable->ia_bits); + return 0; +} + +static int kvm_pgtable_levels_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level); + return 0; +} + +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + struct kvm *kvm = m->i_private; + struct kvm_pgtable *pgtable; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + pgtable = kvm->arch.mmu.pgt; + + ret = single_open(file, show, pgtable); + if (ret < 0) + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_pgtable_range_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show); +} + +static int kvm_pgtable_levels_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show); +} + +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + + kvm_put_kvm(kvm); + return single_release(m, file); +} + +static const struct file_operations kvm_pgtable_range_fops = { + .open = kvm_pgtable_range_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +static const struct file_operations kvm_pgtable_levels_fops = { + .open = kvm_pgtable_levels_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) +{ + debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry, + kvm, &kvm_ptdump_guest_fops); + debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm, + &kvm_pgtable_range_fops); + debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry, + kvm, &kvm_pgtable_levels_fops); +} diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c new file mode 100644 index 000000000000..4ceabaa4c30b --- /dev/null +++ b/arch/arm64/kvm/pvtime.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Arm Ltd. 
+ +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> +#include <linux/sched/stat.h> + +#include <asm/kvm_mmu.h> +#include <asm/pvclock-abi.h> + +#include <kvm/arm_hypercalls.h> + +void kvm_update_stolen_time(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 base = vcpu->arch.steal.base; + u64 last_steal = vcpu->arch.steal.last_steal; + u64 offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); + u64 steal = 0; + int idx; + + if (base == INVALID_GPA) + return; + + idx = srcu_read_lock(&kvm->srcu); + if (!kvm_get_guest(kvm, base + offset, steal)) { + steal = le64_to_cpu(steal); + vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); + steal += vcpu->arch.steal.last_steal - last_steal; + kvm_put_guest(kvm, base + offset, cpu_to_le64(steal)); + } + srcu_read_unlock(&kvm->srcu, idx); +} + +long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) +{ + u32 feature = smccc_get_arg1(vcpu); + long val = SMCCC_RET_NOT_SUPPORTED; + + switch (feature) { + case ARM_SMCCC_HV_PV_TIME_FEATURES: + case ARM_SMCCC_HV_PV_TIME_ST: + if (vcpu->arch.steal.base != INVALID_GPA) + val = SMCCC_RET_SUCCESS; + break; + } + + return val; +} + +gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) +{ + struct pvclock_vcpu_stolen_time init_values = {}; + struct kvm *kvm = vcpu->kvm; + u64 base = vcpu->arch.steal.base; + + if (base == INVALID_GPA) + return base; + + /* + * Start counting stolen time from the time the guest requests + * the feature enabled. + */ + vcpu->arch.steal.last_steal = current->sched_info.run_delay; + kvm_write_guest_lock(kvm, base, &init_values, sizeof(init_values)); + + return base; +} + +bool kvm_arm_pvtime_supported(void) +{ + return !!sched_info_on(); +} + +int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *user = (u64 __user *)attr->addr; + struct kvm *kvm = vcpu->kvm; + u64 ipa; + int ret = 0; + int idx; + + if (!kvm_arm_pvtime_supported() || + attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + return -ENXIO; + + if (get_user(ipa, user)) + return -EFAULT; + if (!IS_ALIGNED(ipa, 64)) + return -EINVAL; + if (vcpu->arch.steal.base != INVALID_GPA) + return -EEXIST; + + /* Check the address is in a valid memslot */ + idx = srcu_read_lock(&kvm->srcu); + if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT))) + ret = -EINVAL; + srcu_read_unlock(&kvm->srcu, idx); + + if (!ret) + vcpu->arch.steal.base = ipa; + + return ret; +} + +int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *user = (u64 __user *)attr->addr; + u64 ipa; + + if (!kvm_arm_pvtime_supported() || + attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + return -ENXIO; + + ipa = vcpu->arch.steal.base; + + if (put_user(ipa, user)) + return -EFAULT; + return 0; +} + +int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PVTIME_IPA: + if (kvm_arm_pvtime_supported()) + return 0; + } + return -ENXIO; +} diff --git a/arch/arm64/kvm/regmap.c b/arch/arm64/kvm/regmap.c deleted file mode 100644 index bbc6ae32e4af..000000000000 --- a/arch/arm64/kvm/regmap.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * Derived from arch/arm/kvm/emulate.c: - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU 
General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/mm.h> -#include <linux/kvm_host.h> -#include <asm/kvm_emulate.h> -#include <asm/ptrace.h> - -#define VCPU_NR_MODES 6 -#define REG_OFFSET(_reg) \ - (offsetof(struct user_pt_regs, _reg) / sizeof(unsigned long)) - -#define USR_REG_OFFSET(R) REG_OFFSET(compat_usr(R)) - -static const unsigned long vcpu_reg_offsets[VCPU_NR_MODES][16] = { - /* USR Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), USR_REG_OFFSET(13), USR_REG_OFFSET(14), - REG_OFFSET(pc) - }, - - /* FIQ Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), - REG_OFFSET(compat_r8_fiq), /* r8 */ - REG_OFFSET(compat_r9_fiq), /* r9 */ - REG_OFFSET(compat_r10_fiq), /* r10 */ - REG_OFFSET(compat_r11_fiq), /* r11 */ - REG_OFFSET(compat_r12_fiq), /* r12 */ - REG_OFFSET(compat_sp_fiq), /* r13 */ - REG_OFFSET(compat_lr_fiq), /* r14 */ - REG_OFFSET(pc) - }, - - /* IRQ Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(compat_sp_irq), /* r13 */ - REG_OFFSET(compat_lr_irq), /* r14 */ - REG_OFFSET(pc) - }, - - /* SVC Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(compat_sp_svc), /* r13 */ - REG_OFFSET(compat_lr_svc), /* r14 */ - REG_OFFSET(pc) - }, - - /* ABT Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(compat_sp_abt), /* r13 */ - REG_OFFSET(compat_lr_abt), /* r14 */ - REG_OFFSET(pc) - }, - - /* UND Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(compat_sp_und), /* r13 */ - REG_OFFSET(compat_lr_und), /* r14 */ - REG_OFFSET(pc) - }, -}; - -/* - * Return a pointer to the register number valid in the current mode of - * the virtual CPU. 
- */ -unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num) -{ - unsigned long *reg_array = (unsigned long *)&vcpu->arch.ctxt.gp_regs.regs; - unsigned long mode = *vcpu_cpsr(vcpu) & COMPAT_PSR_MODE_MASK; - - switch (mode) { - case COMPAT_PSR_MODE_USR ... COMPAT_PSR_MODE_SVC: - mode &= ~PSR_MODE32_BIT; /* 0 ... 3 */ - break; - - case COMPAT_PSR_MODE_ABT: - mode = 4; - break; - - case COMPAT_PSR_MODE_UND: - mode = 5; - break; - - case COMPAT_PSR_MODE_SYS: - mode = 0; /* SYS maps to USR */ - break; - - default: - BUG(); - } - - return reg_array + vcpu_reg_offsets[mode][reg_num]; -} - -/* - * Return the SPSR for the current mode of the virtual CPU. - */ -unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu) -{ - unsigned long mode = *vcpu_cpsr(vcpu) & COMPAT_PSR_MODE_MASK; - switch (mode) { - case COMPAT_PSR_MODE_SVC: - mode = KVM_SPSR_SVC; - break; - case COMPAT_PSR_MODE_ABT: - mode = KVM_SPSR_ABT; - break; - case COMPAT_PSR_MODE_UND: - mode = KVM_SPSR_UND; - break; - case COMPAT_PSR_MODE_IRQ: - mode = KVM_SPSR_IRQ; - break; - case COMPAT_PSR_MODE_FIQ: - mode = KVM_SPSR_FIQ; - break; - default: - BUG(); - } - - return (unsigned long *)&vcpu_gp_regs(vcpu)->spsr[mode]; -} diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 70a7816535cd..959532422d3a 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> @@ -5,108 +6,308 @@ * Derived from arch/arm/kvm/reset.c * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/errno.h> +#include <linux/kernel.h> #include <linux/kvm_host.h> #include <linux/kvm.h> +#include <linux/hw_breakpoint.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/types.h> #include <kvm/arm_arch_timer.h> +#include <asm/cpufeature.h> #include <asm/cputype.h> +#include <asm/fpsimd.h> #include <asm/ptrace.h> #include <asm/kvm_arm.h> -#include <asm/kvm_coproc.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> +#include <asm/virt.h> + +/* Maximum phys_shift supported for any VM on this host */ +static u32 __ro_after_init kvm_ipa_limit; +unsigned int __ro_after_init kvm_host_sve_max_vl; /* * ARMv8 Reset Values */ -static const struct kvm_regs default_regs_reset = { - .regs.pstate = (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | - PSR_F_BIT | PSR_D_BIT), -}; - -static const struct kvm_regs default_regs_reset32 = { - .regs.pstate = (COMPAT_PSR_MODE_SVC | COMPAT_PSR_A_BIT | - COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT), -}; - -static const struct kvm_irq_level default_vtimer_irq = { - .irq = 27, - .level = 1, -}; - -static bool cpu_has_32bit_el1(void) +#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ + PSR_F_BIT | PSR_D_BIT) + +#define VCPU_RESET_PSTATE_EL2 (PSR_MODE_EL2h | PSR_A_BIT | PSR_I_BIT | \ + PSR_F_BIT | PSR_D_BIT) + +#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ + PSR_AA32_I_BIT | PSR_AA32_F_BIT) + +unsigned int __ro_after_init kvm_sve_max_vl; + +int __init kvm_arm_init_sve(void) { - u64 pfr0; + if (system_supports_sve()) { + kvm_sve_max_vl = sve_max_virtualisable_vl(); + kvm_host_sve_max_vl = sve_max_vl(); + kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl; + + /* + * The get_sve_reg()/set_sve_reg() ioctl interface will need + * to be extended with multiple register slice support in + * order to support vector lengths greater than + * VL_ARCH_MAX: + */ + if (WARN_ON(kvm_sve_max_vl > VL_ARCH_MAX)) + kvm_sve_max_vl = VL_ARCH_MAX; - pfr0 = read_cpuid(ID_AA64PFR0_EL1); - return !!(pfr0 & 0x20); + /* + * Don't even try to make use of vector lengths that + * aren't available on all CPUs, for now: + */ + if (kvm_sve_max_vl < sve_max_vl()) + pr_warn("KVM: SVE vector length for guests limited to %u bytes\n", + kvm_sve_max_vl); + } + + return 0; } -int kvm_arch_dev_ioctl_check_extension(long ext) +static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) { - int r; + vcpu->arch.sve_max_vl = kvm_sve_max_vl; - switch (ext) { - case KVM_CAP_ARM_EL1_32BIT: - r = cpu_has_32bit_el1(); - break; - default: - r = 0; + /* + * Userspace can still customize the vector lengths by writing + * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until + * kvm_arm_vcpu_finalize(), which freezes the configuration. + */ + set_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &vcpu->kvm->arch.flags); +} + +/* + * Finalize vcpu's maximum SVE vector length, allocating + * vcpu->arch.sve_state as necessary. + */ +static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) +{ + void *buf; + unsigned int vl; + size_t reg_sz; + int ret; + + vl = vcpu->arch.sve_max_vl; + + /* + * Responsibility for these properties is shared between + * kvm_arm_init_sve(), kvm_vcpu_enable_sve() and + * set_sve_vls(). 
Double-check here just to be sure: + */ + if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl() || + vl > VL_ARCH_MAX)) + return -EIO; + + reg_sz = vcpu_sve_state_size(vcpu); + buf = kzalloc(reg_sz, GFP_KERNEL_ACCOUNT); + if (!buf) + return -ENOMEM; + + ret = kvm_share_hyp(buf, buf + reg_sz); + if (ret) { + kfree(buf); + return ret; } + + vcpu->arch.sve_state = buf; + vcpu_set_flag(vcpu, VCPU_SVE_FINALIZED); + return 0; +} + +int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature) +{ + switch (feature) { + case KVM_ARM_VCPU_SVE: + if (!vcpu_has_sve(vcpu)) + return -EINVAL; + + if (kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; + + return kvm_vcpu_finalize_sve(vcpu); + } + + return -EINVAL; +} + +bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_sve(vcpu) && !kvm_arm_vcpu_sve_finalized(vcpu)) + return false; + + return true; +} + +void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + void *sve_state = vcpu->arch.sve_state; - return r; + kvm_unshare_hyp(vcpu, vcpu + 1); + if (sve_state) + kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); + kfree(sve_state); + free_page((unsigned long)vcpu->arch.ctxt.vncr_array); + kfree(vcpu->arch.vncr_tlb); + kfree(vcpu->arch.ccsidr); +} + +static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_sve(vcpu)) + memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer * - * This function finds the right table above and sets the registers on - * the virtual CPU struct to their architectually defined reset - * values. + * This function sets the registers on the virtual CPU struct to their + * architecturally defined reset values, except for registers whose reset is + * deferred until kvm_arm_vcpu_finalize(). + * + * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT + * ioctl or as part of handling a request issued by another VCPU in the PSCI + * handling code. In the first case, the VCPU will not be loaded, and in the + * second case the VCPU will be loaded. Because this function operates purely + * on the memory-backed values of system registers, we want to do a full put if + * we were loaded (handling a request) and load the values back at the end of + * the function. Otherwise we leave the state alone. In both cases, we + * disable preemption around the vcpu reset as we would otherwise race with + * preempt notifiers which also call put/load. 
*/ -int kvm_reset_vcpu(struct kvm_vcpu *vcpu) +void kvm_reset_vcpu(struct kvm_vcpu *vcpu) { - const struct kvm_irq_level *cpu_vtimer_irq; - const struct kvm_regs *cpu_reset; + struct vcpu_reset_state reset_state; + bool loaded; + u32 pstate; - switch (vcpu->arch.target) { - default: - if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { - if (!cpu_has_32bit_el1()) - return -EINVAL; - cpu_reset = &default_regs_reset32; - vcpu->arch.hcr_el2 &= ~HCR_RW; - } else { - cpu_reset = &default_regs_reset; - } + spin_lock(&vcpu->arch.mp_state_lock); + reset_state = vcpu->arch.reset_state; + vcpu->arch.reset_state.reset = false; + spin_unlock(&vcpu->arch.mp_state_lock); - cpu_vtimer_irq = &default_vtimer_irq; - break; + preempt_disable(); + loaded = (vcpu->cpu != -1); + if (loaded) + kvm_arch_vcpu_put(vcpu); + + if (!kvm_arm_vcpu_sve_finalized(vcpu)) { + if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) + kvm_vcpu_enable_sve(vcpu); + } else { + kvm_vcpu_reset_sve(vcpu); } + if (vcpu_el1_is_32bit(vcpu)) + pstate = VCPU_RESET_PSTATE_SVC; + else if (vcpu_has_nv(vcpu)) + pstate = VCPU_RESET_PSTATE_EL2; + else + pstate = VCPU_RESET_PSTATE_EL1; + /* Reset core registers */ - memcpy(vcpu_gp_regs(vcpu), cpu_reset, sizeof(*cpu_reset)); + memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); + memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); + vcpu->arch.ctxt.spsr_abt = 0; + vcpu->arch.ctxt.spsr_und = 0; + vcpu->arch.ctxt.spsr_irq = 0; + vcpu->arch.ctxt.spsr_fiq = 0; + vcpu_gp_regs(vcpu)->pstate = pstate; /* Reset system registers */ kvm_reset_sys_regs(vcpu); + /* + * Additional reset state handling that PSCI may have imposed on us. + * Must be done after all the sys_reg reset. + */ + if (reset_state.reset) { + unsigned long target_pc = reset_state.pc; + + /* Gracefully handle Thumb2 entry point */ + if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { + target_pc &= ~1UL; + vcpu_set_thumb(vcpu); + } + + /* Propagate caller endianness */ + if (reset_state.be) + kvm_vcpu_set_be(vcpu); + + *vcpu_pc(vcpu) = target_pc; + vcpu_set_reg(vcpu, 0, reset_state.r0); + } + /* Reset timer */ - kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); + kvm_timer_vcpu_reset(vcpu); + + if (loaded) + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); +} + +u32 kvm_get_pa_bits(struct kvm *kvm) +{ + /* Fixed limit until we can configure ID_AA64MMFR0.PARange */ + return kvm_ipa_limit; +} + +u32 get_kvm_ipa_limit(void) +{ + return kvm_ipa_limit; +} + +int __init kvm_set_ipa_limit(void) +{ + unsigned int parange; + u64 mmfr0; + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + /* + * IPA size beyond 48 bits for 4K and 16K page size is only supported + * when LPA2 is available. So if we have LPA2, enable it, else cap to 48 + * bits, in case it's reported as larger on the system. + */ + if (!kvm_lpa2_is_enabled() && PAGE_SIZE != SZ_64K) + parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48); + + /* + * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at + * Stage-2. If not, things will stop very quickly. + */ + switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN_2_SHIFT)) { + case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE: + kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); + return -EINVAL; + case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT: + kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n"); + break; + case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN ... 
ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX: + kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n"); + break; + default: + kvm_err("Unsupported value for TGRAN_2, giving up\n"); + return -EINVAL; + } + + kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); + kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, + ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? + " (Reduced IPA size, limited VM/VMM compatibility)" : "")); return 0; } diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c new file mode 100644 index 000000000000..af5eec681127 --- /dev/null +++ b/arch/arm64/kvm/stacktrace.c @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. This saves having + * to allocate shared buffers for the host to read the unwinded + * stacktrace. + * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwinded in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. + * + * Copyright (C) 2022 Google LLC + */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_mmu.h> +#include <asm/stacktrace/nvhe.h> + +static struct stack_info stackinfo_get_overflow(void) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base; + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} + +static struct stack_info stackinfo_get_overflow_kern_va(void) +{ + unsigned long low = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} + +static struct stack_info stackinfo_get_hyp(void) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->stack_base; + unsigned long high = low + NVHE_STACK_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} + +static struct stack_info stackinfo_get_hyp_kern_va(void) +{ + unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_base); + unsigned long high = low + NVHE_STACK_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} + +/* + * kvm_nvhe_stack_kern_va - Convert KVM nVHE HYP stack addresses to a kernel VAs + * + * The nVHE hypervisor stack is mapped in the flexible 'private' VA range, to + * allow for guard pages below the stack. Consequently, the fixed offset address + * translation macros won't work here. + * + * The kernel VA is calculated as an offset from the kernel VA of the hypervisor + * stack base. + * + * Returns true on success and updates @addr to its corresponding kernel VA; + * otherwise returns false. 
+ */ +static bool kvm_nvhe_stack_kern_va(unsigned long *addr, unsigned long size) +{ + struct stack_info stack_hyp, stack_kern; + + stack_hyp = stackinfo_get_hyp(); + stack_kern = stackinfo_get_hyp_kern_va(); + if (stackinfo_on_stack(&stack_hyp, *addr, size)) + goto found; + + stack_hyp = stackinfo_get_overflow(); + stack_kern = stackinfo_get_overflow_kern_va(); + if (stackinfo_on_stack(&stack_hyp, *addr, size)) + goto found; + + return false; + +found: + *addr = *addr - stack_hyp.low + stack_kern.low; + return true; +} + +/* + * Convert a KVN nVHE HYP frame record address to a kernel VA + */ +static bool kvm_nvhe_stack_kern_record_va(unsigned long *addr) +{ + return kvm_nvhe_stack_kern_va(addr, 16); +} + +static int unwind_next(struct unwind_state *state) +{ + /* + * The FP is in the hypervisor VA space. Convert it to the kernel VA + * space so it can be unwound by the regular unwind functions. + */ + if (!kvm_nvhe_stack_kern_record_va(&state->fp)) + return -EINVAL; + + return unwind_next_frame_record(state); +} + +static void unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + +/* + * kvm_nvhe_dump_backtrace_entry - Symbolize and print an nVHE backtrace entry + * + * @arg : the hypervisor offset, used for address translation + * @where : the program counter corresponding to the stack frame + */ +static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long va_mask = GENMASK_ULL(__hyp_va_bits - 1, 0); + unsigned long hyp_offset = (unsigned long)arg; + + /* Mask tags and convert to kern addr */ + where = (where & va_mask) + hyp_offset; + kvm_err(" [<%016lx>] %pB\n", where, (void *)(where + kaslr_offset())); + + return true; +} + +static void kvm_nvhe_dump_backtrace_start(void) +{ + kvm_err("nVHE call trace:\n"); +} + +static void kvm_nvhe_dump_backtrace_end(void) +{ + kvm_err("---[ end nVHE call trace ]---\n"); +} + +/* + * hyp_dump_backtrace - Dump the non-protected nVHE backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * The host can directly access HYP stack pages in non-protected + * mode, so the unwinding is done directly from EL1. This removes + * the need for shared buffers between host and hypervisor for + * the stacktrace. + */ +static void hyp_dump_backtrace(unsigned long hyp_offset) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + struct stack_info stacks[] = { + stackinfo_get_overflow_kern_va(), + stackinfo_get_hyp_kern_va(), + }; + struct unwind_state state = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + kvm_nvhe_unwind_init(&state, stacktrace_info->fp, stacktrace_info->pc); + + kvm_nvhe_dump_backtrace_start(); + unwind(&state, kvm_nvhe_dump_backtrace_entry, (void *)hyp_offset); + kvm_nvhe_dump_backtrace_end(); +} + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], + pkvm_stacktrace); + +/* + * pkvm_dump_backtrace - Dump the protected nVHE HYP backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * Dumping of the pKVM HYP backtrace is done by reading the + * stack addresses from the shared stacktrace buffer, since the + * host cannot directly access hypervisor memory in protected + * mode. 
+ */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + unsigned long *stacktrace + = (unsigned long *) this_cpu_ptr_nvhe_sym(pkvm_stacktrace); + int i; + + kvm_nvhe_dump_backtrace_start(); + /* The saved stacktrace is terminated by a null entry */ + for (i = 0; + i < ARRAY_SIZE(kvm_nvhe_sym(pkvm_stacktrace)) && stacktrace[i]; + i++) + kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]); + kvm_nvhe_dump_backtrace_end(); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n"); +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +/* + * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + */ +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset) +{ + if (is_protected_kvm_enabled()) + pkvm_dump_backtrace(hyp_offset); + else + hyp_dump_backtrace(hyp_offset); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 94923609753b..c8fd7c6a12a1 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> @@ -6,555 +7,5184 @@ * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Authors: Rusty Russell <rusty@rustcorp.com.au> * Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#include <linux/mm.h> +#include <linux/bitfield.h> +#include <linux/bsearch.h> +#include <linux/cacheinfo.h> +#include <linux/debugfs.h> #include <linux/kvm_host.h> +#include <linux/mm.h> +#include <linux/printk.h> #include <linux/uaccess.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_host.h> -#include <asm/kvm_emulate.h> -#include <asm/kvm_coproc.h> +#include <linux/irqchip/arm-gic-v3.h> + +#include <asm/arm_pmuv3.h> #include <asm/cacheflush.h> #include <asm/cputype.h> +#include <asm/debug-monitors.h> +#include <asm/esr.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> +#include <asm/perf_event.h> +#include <asm/sysreg.h> + #include <trace/events/kvm.h> #include "sys_regs.h" +#include "vgic/vgic.h" + +#include "trace.h" /* - * All of this file is extremly similar to the ARM coproc.c, but the - * types are different. My gut feeling is that it should be pretty - * easy to merge, but that would be an ABI breakage -- again. VFP - * would also need to be abstracted. - * * For AArch32, we only take care of what is being trapped. Anything * that has to do with init and userspace access has to go via the * 64bit interface. 
*/ -/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ -static u32 cache_levels; +static u64 sys_reg_to_index(const struct sys_reg_desc *reg); +static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val); + +static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return false; +} + +static bool bad_trap(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r, + const char *msg) +{ + WARN_ONCE(1, "Unexpected %s\n", msg); + print_sys_reg_instr(params); + return undef_access(vcpu, params, r); +} + +static bool read_from_write_only(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) +{ + return bad_trap(vcpu, params, r, + "sys_reg read to write-only register"); +} + +static bool write_to_read_only(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) +{ + return bad_trap(vcpu, params, r, + "sys_reg write to read-only register"); +} + +enum sr_loc_attr { + SR_LOC_MEMORY = 0, /* Register definitely in memory */ + SR_LOC_LOADED = BIT(0), /* Register on CPU, unless it cannot */ + SR_LOC_MAPPED = BIT(1), /* Register in a different CPU register */ + SR_LOC_XLATED = BIT(2), /* Register translated to fit another reg */ + SR_LOC_SPECIAL = BIT(3), /* Demanding register, implies loaded */ +}; + +struct sr_loc { + enum sr_loc_attr loc; + enum vcpu_sysreg map_reg; + u64 (*xlate)(u64); +}; + +static enum sr_loc_attr locate_direct_register(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) +{ + switch (reg) { + case SCTLR_EL1: + case CPACR_EL1: + case TTBR0_EL1: + case TTBR1_EL1: + case TCR_EL1: + case TCR2_EL1: + case PIR_EL1: + case PIRE0_EL1: + case POR_EL1: + case ESR_EL1: + case AFSR0_EL1: + case AFSR1_EL1: + case FAR_EL1: + case MAIR_EL1: + case VBAR_EL1: + case CONTEXTIDR_EL1: + case AMAIR_EL1: + case CNTKCTL_EL1: + case ELR_EL1: + case SPSR_EL1: + case ZCR_EL1: + case SCTLR2_EL1: + /* + * EL1 registers which have an ELx2 mapping are loaded if + * we're not in hypervisor context. + */ + return is_hyp_ctxt(vcpu) ? SR_LOC_MEMORY : SR_LOC_LOADED; + + case TPIDR_EL0: + case TPIDRRO_EL0: + case TPIDR_EL1: + case PAR_EL1: + case DACR32_EL2: + case IFSR32_EL2: + case DBGVCR32_EL2: + /* These registers are always loaded, no matter what */ + return SR_LOC_LOADED; + + default: + /* Non-mapped EL2 registers are by definition in memory. 
*/ + return SR_LOC_MEMORY; + } +} + +static void locate_mapped_el2_register(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg, + enum vcpu_sysreg map_reg, + u64 (*xlate)(u64), + struct sr_loc *loc) +{ + if (!is_hyp_ctxt(vcpu)) { + loc->loc = SR_LOC_MEMORY; + return; + } + + loc->loc = SR_LOC_LOADED | SR_LOC_MAPPED; + loc->map_reg = map_reg; + + WARN_ON(locate_direct_register(vcpu, map_reg) != SR_LOC_MEMORY); + + if (xlate != NULL && !vcpu_el2_e2h_is_set(vcpu)) { + loc->loc |= SR_LOC_XLATED; + loc->xlate = xlate; + } +} + +#define MAPPED_EL2_SYSREG(r, m, t) \ + case r: { \ + locate_mapped_el2_register(vcpu, r, m, t, loc); \ + break; \ + } + +static void locate_register(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, + struct sr_loc *loc) +{ + if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) { + loc->loc = SR_LOC_MEMORY; + return; + } + + switch (reg) { + MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1, + translate_sctlr_el2_to_sctlr_el1 ); + MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1, + translate_cptr_el2_to_cpacr_el1 ); + MAPPED_EL2_SYSREG(TTBR0_EL2, TTBR0_EL1, + translate_ttbr0_el2_to_ttbr0_el1 ); + MAPPED_EL2_SYSREG(TTBR1_EL2, TTBR1_EL1, NULL ); + MAPPED_EL2_SYSREG(TCR_EL2, TCR_EL1, + translate_tcr_el2_to_tcr_el1 ); + MAPPED_EL2_SYSREG(VBAR_EL2, VBAR_EL1, NULL ); + MAPPED_EL2_SYSREG(AFSR0_EL2, AFSR0_EL1, NULL ); + MAPPED_EL2_SYSREG(AFSR1_EL2, AFSR1_EL1, NULL ); + MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL ); + MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); + MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); + MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL ); + MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL ); + MAPPED_EL2_SYSREG(POR_EL2, POR_EL1, NULL ); + MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); + MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); + MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); + MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL ); + case CNTHCTL_EL2: + /* CNTHCTL_EL2 is super special, until we support NV2.1 */ + loc->loc = ((is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ? 
+ SR_LOC_SPECIAL : SR_LOC_MEMORY); + break; + default: + loc->loc = locate_direct_register(vcpu, reg); + } +} + +static u64 read_sr_from_cpu(enum vcpu_sysreg reg) +{ + u64 val = 0x8badf00d8badf00d; + + switch (reg) { + case SCTLR_EL1: val = read_sysreg_s(SYS_SCTLR_EL12); break; + case CPACR_EL1: val = read_sysreg_s(SYS_CPACR_EL12); break; + case TTBR0_EL1: val = read_sysreg_s(SYS_TTBR0_EL12); break; + case TTBR1_EL1: val = read_sysreg_s(SYS_TTBR1_EL12); break; + case TCR_EL1: val = read_sysreg_s(SYS_TCR_EL12); break; + case TCR2_EL1: val = read_sysreg_s(SYS_TCR2_EL12); break; + case PIR_EL1: val = read_sysreg_s(SYS_PIR_EL12); break; + case PIRE0_EL1: val = read_sysreg_s(SYS_PIRE0_EL12); break; + case POR_EL1: val = read_sysreg_s(SYS_POR_EL12); break; + case ESR_EL1: val = read_sysreg_s(SYS_ESR_EL12); break; + case AFSR0_EL1: val = read_sysreg_s(SYS_AFSR0_EL12); break; + case AFSR1_EL1: val = read_sysreg_s(SYS_AFSR1_EL12); break; + case FAR_EL1: val = read_sysreg_s(SYS_FAR_EL12); break; + case MAIR_EL1: val = read_sysreg_s(SYS_MAIR_EL12); break; + case VBAR_EL1: val = read_sysreg_s(SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break; + case AMAIR_EL1: val = read_sysreg_s(SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: val = read_sysreg_s(SYS_CNTKCTL_EL12); break; + case ELR_EL1: val = read_sysreg_s(SYS_ELR_EL12); break; + case SPSR_EL1: val = read_sysreg_s(SYS_SPSR_EL12); break; + case ZCR_EL1: val = read_sysreg_s(SYS_ZCR_EL12); break; + case SCTLR2_EL1: val = read_sysreg_s(SYS_SCTLR2_EL12); break; + case TPIDR_EL0: val = read_sysreg_s(SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: val = read_sysreg_s(SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: val = read_sysreg_s(SYS_TPIDR_EL1); break; + case PAR_EL1: val = read_sysreg_par(); break; + case DACR32_EL2: val = read_sysreg_s(SYS_DACR32_EL2); break; + case IFSR32_EL2: val = read_sysreg_s(SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: val = read_sysreg_s(SYS_DBGVCR32_EL2); break; + default: WARN_ON_ONCE(1); + } + + return val; +} + +static void write_sr_to_cpu(enum vcpu_sysreg reg, u64 val) +{ + switch (reg) { + case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; + case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; + case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; + case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; + case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; + case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break; + case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break; + case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break; + case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break; + case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; + case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; + case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; + case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break; + case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break; + case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break; + case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; + case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break; + case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break; + case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break; + case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break; + case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: 
write_sysreg_s(val, SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break; + case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; + case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; + case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; + default: WARN_ON_ONCE(1); + } +} + +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg) +{ + struct sr_loc loc = {}; + + locate_register(vcpu, reg, &loc); + + WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY); + + if (loc.loc & SR_LOC_SPECIAL) { + u64 val; + + WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL); + + /* + * CNTHCTL_EL2 requires some special treatment to account + * for the bits that can be set via CNTKCTL_EL1 when E2H==1. + */ + switch (reg) { + case CNTHCTL_EL2: + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; + return val; + default: + WARN_ON_ONCE(1); + } + } + + if (loc.loc & SR_LOC_LOADED) { + enum vcpu_sysreg map_reg = reg; + + if (loc.loc & SR_LOC_MAPPED) + map_reg = loc.map_reg; + + if (!(loc.loc & SR_LOC_XLATED)) { + u64 val = read_sr_from_cpu(map_reg); + + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); + + return val; + } + } + + return __vcpu_sys_reg(vcpu, reg); +} + +void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, enum vcpu_sysreg reg) +{ + struct sr_loc loc = {}; + + locate_register(vcpu, reg, &loc); + + WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY); + + if (loc.loc & SR_LOC_SPECIAL) { + + WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL); + + switch (reg) { + case CNTHCTL_EL2: + /* + * If E2H=1, some of the bits are backed by + * CNTKCTL_EL1, while the rest is kept in memory. + * Yes, this is fun stuff. + */ + write_sysreg_el1(val, SYS_CNTKCTL); + break; + default: + WARN_ON_ONCE(1); + } + } + + if (loc.loc & SR_LOC_LOADED) { + enum vcpu_sysreg map_reg = reg; + u64 xlated_val; + + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); + + if (loc.loc & SR_LOC_MAPPED) + map_reg = loc.map_reg; + + if (loc.loc & SR_LOC_XLATED) + xlated_val = loc.xlate(val); + else + xlated_val = val; + + write_sr_to_cpu(map_reg, xlated_val); + + /* + * Fall through to write the backing store anyway, which + * allows translated registers to be directly read without a + * reverse translation. + */ + } + + __vcpu_assign_sys_reg(vcpu, reg, val); +} /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ -#define CSSELR_MAX 12 +#define CSSELR_MAX 14 + +/* + * Returns the minimum line size for the selected cache, expressed as + * Log2(bytes). + */ +static u8 get_min_cache_line_size(bool icache) +{ + u64 ctr = read_sanitised_ftr_reg(SYS_CTR_EL0); + u8 field; + + if (icache) + field = SYS_FIELD_GET(CTR_EL0, IminLine, ctr); + else + field = SYS_FIELD_GET(CTR_EL0, DminLine, ctr); + + /* + * Cache line size is represented as Log2(words) in CTR_EL0. + * Log2(bytes) can be derived with the following: + * + * Log2(words) + 2 = Log2(bytes / 4) + 2 + * = Log2(bytes) - 2 + 2 + * = Log2(bytes) + */ + return field + 2; +} /* Which cache CCSIDR represents depends on CSSELR value. */ -static u32 get_ccsidr(u32 csselr) +static u32 get_ccsidr(struct kvm_vcpu *vcpu, u32 csselr) { - u32 ccsidr; + u8 line_size; - /* Make sure noone else changes CSSELR during this! 
*/ - local_irq_disable(); - /* Put value into CSSELR */ - asm volatile("msr csselr_el1, %x0" : : "r" (csselr)); - isb(); - /* Read result out of CCSIDR */ - asm volatile("mrs %0, ccsidr_el1" : "=r" (ccsidr)); - local_irq_enable(); + if (vcpu->arch.ccsidr) + return vcpu->arch.ccsidr[csselr]; - return ccsidr; + line_size = get_min_cache_line_size(csselr & CSSELR_EL1_InD); + + /* + * Fabricate a CCSIDR value as the overriding value does not exist. + * The real CCSIDR value will not be used as it can vary by the + * physical CPU which the vcpu currently resides in. + * + * The line size is determined with get_min_cache_line_size(), which + * should be valid for all CPUs even if they have different cache + * configuration. + * + * The associativity bits are cleared, meaning the geometry of all data + * and unified caches (which are guaranteed to be PIPT and thus + * non-aliasing) are 1 set and 1 way. + * Guests should not be doing cache operations by set/way at all, and + * for this reason, we trap them and attempt to infer the intent, so + * that we can flush the entire guest's address space at the appropriate + * time. The exposed geometry minimizes the number of the traps. + * [If guests should attempt to infer aliasing properties from the + * geometry (which is not permitted by the architecture), they would + * only do so for virtually indexed caches.] + * + * We don't check if the cache level exists as it is allowed to return + * an UNKNOWN value if not. + */ + return SYS_FIELD_PREP(CCSIDR_EL1, LineSize, line_size - 4); } -static void do_dc_cisw(u32 val) +static int set_ccsidr(struct kvm_vcpu *vcpu, u32 csselr, u32 val) { - asm volatile("dc cisw, %x0" : : "r" (val)); - dsb(); + u8 line_size = FIELD_GET(CCSIDR_EL1_LineSize, val) + 4; + u32 *ccsidr = vcpu->arch.ccsidr; + u32 i; + + if ((val & CCSIDR_EL1_RES0) || + line_size < get_min_cache_line_size(csselr & CSSELR_EL1_InD)) + return -EINVAL; + + if (!ccsidr) { + if (val == get_ccsidr(vcpu, csselr)) + return 0; + + ccsidr = kmalloc_array(CSSELR_MAX, sizeof(u32), GFP_KERNEL_ACCOUNT); + if (!ccsidr) + return -ENOMEM; + + for (i = 0; i < CSSELR_MAX; i++) + ccsidr[i] = get_ccsidr(vcpu, i); + + vcpu->arch.ccsidr = ccsidr; + } + + ccsidr[csselr] = val; + + return 0; } -static void do_dc_csw(u32 val) +static bool access_rw(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) { - asm volatile("dc csw, %x0" : : "r" (val)); - dsb(); + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, r->reg); + else + p->regval = vcpu_read_sys_reg(vcpu, r->reg); + + return true; } -/* See note at ARM ARM B1.14.4 */ +/* + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). + */ static bool access_dcsw(struct kvm_vcpu *vcpu, - const struct sys_reg_params *p, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!p->is_write) + return read_from_write_only(vcpu, p, r); + + /* + * Only track S/W ops if we don't have FWB. It still indicates + * that the guest is a bit broken (S/W operations should only + * be done by firmware, knowing that there is only a single + * CPU left in the system, and certainly not from non-secure + * software). 
+ */ + if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + kvm_set_way_flush(vcpu); + + return true; +} + +static bool access_dcgsw(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_mte(vcpu->kvm)) + return undef_access(vcpu, p, r); + + /* Treat MTE S/W ops as we treat the classic ones: with contempt */ + return access_dcsw(vcpu, p, r); +} + +static void get_access_mask(const struct sys_reg_desc *r, u64 *mask, u64 *shift) +{ + switch (r->aarch32_map) { + case AA32_LO: + *mask = GENMASK_ULL(31, 0); + *shift = 0; + break; + case AA32_HI: + *mask = GENMASK_ULL(63, 32); + *shift = 32; + break; + default: + *mask = GENMASK_ULL(63, 0); + *shift = 0; + break; + } +} + +/* + * Generic accessor for VM registers. Only called as long as HCR_TVM + * is set. If the guest enables the MMU, we stop trapping the VM + * sys_regs and leave it in complete control of the caches. + */ +static bool access_vm_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + bool was_enabled = vcpu_has_cache_enabled(vcpu); + u64 val, mask, shift; + + BUG_ON(!p->is_write); + + get_access_mask(r, &mask, &shift); + + if (~mask) { + val = vcpu_read_sys_reg(vcpu, r->reg); + val &= ~mask; + } else { + val = 0; + } + + val |= (p->regval & (mask >> shift)) << shift; + vcpu_write_sys_reg(vcpu, val, r->reg); + + kvm_toggle_cache(vcpu, was_enabled); + return true; +} + +static bool access_actlr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 mask, shift; + + if (p->is_write) + return ignore_write(vcpu, p); + + get_access_mask(r, &mask, &shift); + p->regval = (vcpu_read_sys_reg(vcpu, r->reg) & mask) >> shift; + + return true; +} + +/* + * Trap handler for the GICv3 SGI generation system register. + * Forward the request to the VGIC emulation. + * The cp15_64 code makes sure this automatically works + * for both AArch64 and AArch32 accesses. + */ +static bool access_gic_sgi(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + bool g1; + + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + + if (!p->is_write) + return read_from_write_only(vcpu, p, r); + + /* + * In a system where GICD_CTLR.DS=1, a ICC_SGI0R_EL1 access generates + * Group0 SGIs only, while ICC_SGI1R_EL1 can generate either group, + * depending on the SGI configuration. ICC_ASGI1R_EL1 is effectively + * equivalent to ICC_SGI0R_EL1, as there is no "alternative" secure + * group. 
+ */ + if (p->Op0 == 0) { /* AArch32 */ + switch (p->Op1) { + default: /* Keep GCC quiet */ + case 0: /* ICC_SGI1R */ + g1 = true; + break; + case 1: /* ICC_ASGI1R */ + case 2: /* ICC_SGI0R */ + g1 = false; + break; + } + } else { /* AArch64 */ + switch (p->Op2) { + default: /* Keep GCC quiet */ + case 5: /* ICC_SGI1R_EL1 */ + g1 = true; + break; + case 6: /* ICC_ASGI1R_EL1 */ + case 7: /* ICC_SGI0R_EL1 */ + g1 = false; + break; + } + } + + vgic_v3_dispatch_sgi(vcpu, p->regval, g1); + + return true; +} + +static bool access_gic_sre(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + + if (p->is_write) + return ignore_write(vcpu, p); + + if (p->Op1 == 4) { /* ICC_SRE_EL2 */ + p->regval = KVM_ICC_SRE_EL2; + } else { /* ICC_SRE_EL1 */ + p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; + } + + return true; +} + +static bool access_gic_dir(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + + if (!p->is_write) + return undef_access(vcpu, p, r); + + vgic_v3_deactivate(vcpu, p->regval); + + return true; +} + +static bool trap_raz_wi(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, const struct sys_reg_desc *r) { - unsigned long val; - int cpu; + if (p->is_write) + return ignore_write(vcpu, p); + else + return read_zero(vcpu, p); +} + +/* + * ARMv8.1 mandates at least a trivial LORegion implementation, where all the + * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0 + * system, these registers should UNDEF. LORID_EL1 being a RO register, we + * treat it separately. + */ +static bool trap_loregion(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sr = reg_to_encoding(r); + + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) + return undef_access(vcpu, p, r); + + if (p->is_write && sr == SYS_LORID_EL1) + return write_to_read_only(vcpu, p, r); + + return trap_raz_wi(vcpu, p, r); +} +static bool trap_oslar_el1(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ if (!p->is_write) - return read_from_write_only(vcpu, p); + return read_from_write_only(vcpu, p, r); + + kvm_debug_handle_oslar(vcpu, p->regval); + return true; +} + +static bool trap_oslsr_el1(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = __vcpu_sys_reg(vcpu, r->reg); + return true; +} + +static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) +{ + /* + * The only modifiable bit is the OSLK bit. Refuse the write if + * userspace attempts to change any other bit in the register. 
+ */ + if ((val ^ rd->val) & ~OSLSR_EL1_OSLK) + return -EINVAL; + + __vcpu_assign_sys_reg(vcpu, rd->reg, val); + return 0; +} + +static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + return ignore_write(vcpu, p); + } else { + p->regval = read_sysreg(dbgauthstatus_el1); + return true; + } +} + +static bool trap_debug_regs(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + access_rw(vcpu, p, r); + + kvm_debug_set_guest_ownership(vcpu); + return true; +} + +/* + * reg_to_dbg/dbg_to_reg + * + * A 32 bit write to a debug register leave top bits alone + * A 32 bit read from a debug register only returns the bottom bits + */ +static void reg_to_dbg(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *rd, + u64 *dbg_reg) +{ + u64 mask, shift, val; + + get_access_mask(rd, &mask, &shift); + + val = *dbg_reg; + val &= ~mask; + val |= (p->regval & (mask >> shift)) << shift; + *dbg_reg = val; +} + +static void dbg_to_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *rd, + u64 *dbg_reg) +{ + u64 mask, shift; + + get_access_mask(rd, &mask, &shift); + p->regval = (*dbg_reg & mask) >> shift; +} + +static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) +{ + struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state; + + switch (rd->Op2) { + case 0b100: + return &dbg->dbg_bvr[rd->CRm]; + case 0b101: + return &dbg->dbg_bcr[rd->CRm]; + case 0b110: + return &dbg->dbg_wvr[rd->CRm]; + case 0b111: + return &dbg->dbg_wcr[rd->CRm]; + default: + KVM_BUG_ON(1, vcpu->kvm); + return NULL; + } +} + +static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *rd) +{ + u64 *reg = demux_wb_reg(vcpu, rd); + + if (!reg) + return false; + + if (p->is_write) + reg_to_dbg(vcpu, p, rd, reg); + else + dbg_to_reg(vcpu, p, rd, reg); + + kvm_debug_set_guest_ownership(vcpu); + return true; +} + +static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) +{ + u64 *reg = demux_wb_reg(vcpu, rd); + + if (!reg) + return -EINVAL; + + *reg = val; + return 0; +} + +static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val) +{ + u64 *reg = demux_wb_reg(vcpu, rd); + + if (!reg) + return -EINVAL; + + *val = *reg; + return 0; +} + +static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) +{ + u64 *reg = demux_wb_reg(vcpu, rd); + + /* + * Bail early if we couldn't find storage for the register, the + * KVM_BUG_ON() in demux_wb_reg() will prevent this VM from ever + * being run. + */ + if (!reg) + return 0; + + *reg = rd->val; + return rd->val; +} + +static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 amair = read_sysreg(amair_el1); + vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1); + return amair; +} + +static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 actlr = read_sysreg(actlr_el1); + vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1); + return actlr; +} + +static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 mpidr; + + /* + * Map the vcpu_id into the first three affinity level fields of + * the MPIDR. We limit the number of VCPUs in level 0 due to a + * limitation to 16 CPUs in that level in the ICC_SGIxR registers + * of the GICv3 to be able to address each CPU directly when + * sending IPIs. 
+ */ + mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); + mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); + mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); + mpidr |= (1ULL << 31); + vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1); + + return mpidr; +} + +static unsigned int hidden_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return REG_HIDDEN; +} + +static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + if (kvm_vcpu_has_pmu(vcpu)) + return 0; + + return REG_HIDDEN; +} + +static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 mask = BIT(ARMV8_PMU_CYCLE_IDX); + u8 n = vcpu->kvm->arch.nr_pmu_counters; + + if (n) + mask |= GENMASK(n - 1, 0); + + reset_unknown(vcpu, r); + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, mask); + + return __vcpu_sys_reg(vcpu, r->reg); +} + +static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, GENMASK(31, 0)); + + return __vcpu_sys_reg(vcpu, r->reg); +} + +static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + /* This thing will UNDEF, who cares about the reset value? */ + if (!kvm_vcpu_has_pmu(vcpu)) + return 0; + + reset_unknown(vcpu, r); + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, kvm_pmu_evtyper_mask(vcpu->kvm)); + + return __vcpu_sys_reg(vcpu, r->reg); +} + +static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, PMSELR_EL0_SEL_MASK); + + return __vcpu_sys_reg(vcpu, r->reg); +} + +static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 pmcr = 0; + + if (!kvm_supports_32bit_el0()) + pmcr |= ARMV8_PMU_PMCR_LC; + + /* + * The value of PMCR.N field is included when the + * vCPU register is read via kvm_vcpu_read_pmcr(). 
+ */ + __vcpu_assign_sys_reg(vcpu, r->reg, pmcr); + + return __vcpu_sys_reg(vcpu, r->reg); +} + +static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) +{ + u64 reg = __vcpu_sys_reg(vcpu, PMUSERENR_EL0); + bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); + + if (!enabled) + kvm_inject_undefined(vcpu); + + return !enabled; +} + +static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) +{ + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_EN); +} + +static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu) +{ + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN); +} + +static bool pmu_access_cycle_counter_el0_disabled(struct kvm_vcpu *vcpu) +{ + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_EN); +} + +static bool pmu_access_event_counter_el0_disabled(struct kvm_vcpu *vcpu) +{ + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_EN); +} - cpu = get_cpu(); +static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 val; - cpumask_setall(&vcpu->arch.require_dcache_flush); - cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush); + if (pmu_access_el0_disabled(vcpu)) + return false; - /* If we were already preempted, take the long way around */ - if (cpu != vcpu->arch.last_pcpu) { - flush_cache_all(); - goto done; + if (p->is_write) { + /* + * Only update writeable bits of PMCR (continuing into + * kvm_pmu_handle_pmcr() as well) + */ + val = kvm_vcpu_read_pmcr(vcpu); + val &= ~ARMV8_PMU_PMCR_MASK; + val |= p->regval & ARMV8_PMU_PMCR_MASK; + if (!kvm_supports_32bit_el0()) + val |= ARMV8_PMU_PMCR_LC; + kvm_pmu_handle_pmcr(vcpu, val); + } else { + /* PMCR.P & PMCR.C are RAZ */ + val = kvm_vcpu_read_pmcr(vcpu) + & ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C); + p->regval = val; } - val = *vcpu_reg(vcpu, p->Rt); + return true; +} + +static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (pmu_access_event_counter_el0_disabled(vcpu)) + return false; + + if (p->is_write) + __vcpu_assign_sys_reg(vcpu, PMSELR_EL0, p->regval); + else + /* return PMSELR.SEL field */ + p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) + & PMSELR_EL0_SEL_MASK; + + return true; +} + +static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 pmceid, mask, shift; + + BUG_ON(p->is_write); + + if (pmu_access_el0_disabled(vcpu)) + return false; + + get_access_mask(r, &mask, &shift); + + pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1)); + pmceid &= mask; + pmceid >>= shift; + + p->regval = pmceid; + + return true; +} - switch (p->CRm) { - case 6: /* Upgrade DCISW to DCCISW, as per HCR.SWIO */ - case 14: /* DCCISW */ - do_dc_cisw(val); +static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) +{ + u64 pmcr, val; + + pmcr = kvm_vcpu_read_pmcr(vcpu); + val = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); + if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { + kvm_inject_undefined(vcpu); + return false; + } + + return true; +} + +static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u64 idx; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) + /* PMCCNTR_EL0 */ + idx = ARMV8_PMU_CYCLE_IDX; + else + /* PMEVCNTRn_EL0 */ + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + + *val = kvm_pmu_get_counter_value(vcpu, idx); + return 0; +} + +static int set_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 
val) +{ + u64 idx; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) + /* PMCCNTR_EL0 */ + idx = ARMV8_PMU_CYCLE_IDX; + else + /* PMEVCNTRn_EL0 */ + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + + kvm_pmu_set_counter_value_user(vcpu, idx, val); + return 0; +} + +static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 idx = ~0UL; + + if (r->CRn == 9 && r->CRm == 13) { + if (r->Op2 == 2) { + /* PMXEVCNTR_EL0 */ + if (pmu_access_event_counter_el0_disabled(vcpu)) + return false; + + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); + } else if (r->Op2 == 0) { + /* PMCCNTR_EL0 */ + if (pmu_access_cycle_counter_el0_disabled(vcpu)) + return false; + + idx = ARMV8_PMU_CYCLE_IDX; + } + } else if (r->CRn == 0 && r->CRm == 9) { + /* PMCCNTR */ + if (pmu_access_event_counter_el0_disabled(vcpu)) + return false; + + idx = ARMV8_PMU_CYCLE_IDX; + } else if (r->CRn == 14 && (r->CRm & 12) == 8) { + /* PMEVCNTRn_EL0 */ + if (pmu_access_event_counter_el0_disabled(vcpu)) + return false; + + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + } + + /* Catch any decoding mistake */ + WARN_ON(idx == ~0UL); + + if (!pmu_counter_idx_valid(vcpu, idx)) + return false; + + if (p->is_write) { + if (pmu_access_el0_disabled(vcpu)) + return false; + + kvm_pmu_set_counter_value(vcpu, idx, p->regval); + } else { + p->regval = kvm_pmu_get_counter_value(vcpu, idx); + } + + return true; +} + +static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 idx, reg; + + if (pmu_access_el0_disabled(vcpu)) + return false; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { + /* PMXEVTYPER_EL0 */ + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); + reg = PMEVTYPER0_EL0 + idx; + } else if (r->CRn == 14 && (r->CRm & 12) == 12) { + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + if (idx == ARMV8_PMU_CYCLE_IDX) + reg = PMCCFILTR_EL0; + else + /* PMEVTYPERn_EL0 */ + reg = PMEVTYPER0_EL0 + idx; + } else { + BUG(); + } + + if (!pmu_counter_idx_valid(vcpu, idx)) + return false; + + if (p->is_write) { + kvm_pmu_set_counter_event_type(vcpu, p->regval, idx); + kvm_vcpu_pmu_restore_guest(vcpu); + } else { + p->regval = __vcpu_sys_reg(vcpu, reg); + } + + return true; +} + +static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) +{ + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); + + __vcpu_assign_sys_reg(vcpu, r->reg, val & mask); + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + return 0; +} + +static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) +{ + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); + + *val = __vcpu_sys_reg(vcpu, r->reg) & mask; + return 0; +} + +static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 val, mask; + + if (pmu_access_el0_disabled(vcpu)) + return false; + + mask = kvm_pmu_accessible_counter_mask(vcpu); + if (p->is_write) { + val = p->regval & mask; + if (r->Op2 & 0x1) + /* accessing PMCNTENSET_EL0 */ + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, |=, val); + else + /* accessing PMCNTENCLR_EL0 */ + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, ~val); + + kvm_pmu_reprogram_counter_mask(vcpu, val); + } else { + p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + } + + return true; +} + +static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 mask = 
kvm_pmu_accessible_counter_mask(vcpu); + + if (check_pmu_access_disabled(vcpu, 0)) + return false; + + if (p->is_write) { + u64 val = p->regval & mask; + + if (r->Op2 & 0x1) + /* accessing PMINTENSET_EL1 */ + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, |=, val); + else + /* accessing PMINTENCLR_EL1 */ + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, ~val); + } else { + p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); + } + + return true; +} + +static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); + + if (pmu_access_el0_disabled(vcpu)) + return false; + + if (p->is_write) { + if (r->CRm & 0x2) + /* accessing PMOVSSET_EL0 */ + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, (p->regval & mask)); + else + /* accessing PMOVSCLR_EL0 */ + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, ~(p->regval & mask)); + } else { + p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); + } + + return true; +} + +static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 mask; + + if (!p->is_write) + return read_from_write_only(vcpu, p, r); + + if (pmu_write_swinc_el0_disabled(vcpu)) + return false; + + mask = kvm_pmu_accessible_counter_mask(vcpu); + kvm_pmu_software_increment(vcpu, p->regval & mask); + return true; +} + +static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + if (!vcpu_mode_priv(vcpu)) + return undef_access(vcpu, p, r); + + __vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0, + (p->regval & ARMV8_PMU_USERENR_MASK)); + } else { + p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0) + & ARMV8_PMU_USERENR_MASK; + } + + return true; +} + +static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = kvm_vcpu_read_pmcr(vcpu); + return 0; +} + +static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u8 new_n = FIELD_GET(ARMV8_PMU_PMCR_N, val); + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->arch.config_lock); + + /* + * The vCPU can't have more counters than the PMU hardware + * implements. Ignore this error to maintain compatibility + * with the existing KVM behavior. + */ + if (!kvm_vm_has_ran_once(kvm) && + !vcpu_has_nv(vcpu) && + new_n <= kvm_arm_pmu_get_max_counters(kvm)) + kvm->arch.nr_pmu_counters = new_n; + + mutex_unlock(&kvm->arch.config_lock); + + /* + * Ignore writes to RES0 bits, read only bits that are cleared on + * vCPU reset, and writable bits that KVM doesn't support yet. + * (i.e. only PMCR.N and bits [7:0] are mutable from userspace) + * The LP bit is RES0 when FEAT_PMUv3p5 is not supported on the vCPU. + * But, we leave the bit as it is here, as the vCPU's PMUver might + * be changed later (NOTE: the bit will be cleared on first vCPU run + * if necessary). 
+ */ + val &= ARMV8_PMU_PMCR_MASK; + + /* The LC bit is RES1 when AArch32 is not supported */ + if (!kvm_supports_32bit_el0()) + val |= ARMV8_PMU_PMCR_LC; + + __vcpu_assign_sys_reg(vcpu, r->reg, val); + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + return 0; +} + +/* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ +#define DBG_BCR_BVR_WCR_WVR_EL1(n) \ + { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ + { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ + { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ + { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg } + +#define PMU_SYS_REG(name) \ + SYS_DESC(SYS_##name), .reset = reset_pmu_reg, \ + .visibility = pmu_visibility + +/* Macro to expand the PMEVCNTRn_EL0 register */ +#define PMU_PMEVCNTR_EL0(n) \ + { PMU_SYS_REG(PMEVCNTRn_EL0(n)), \ + .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ + .set_user = set_pmu_evcntr, \ + .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } + +/* Macro to expand the PMEVTYPERn_EL0 register */ +#define PMU_PMEVTYPER_EL0(n) \ + { PMU_SYS_REG(PMEVTYPERn_EL0(n)), \ + .reset = reset_pmevtyper, \ + .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } + +/* Macro to expand the AMU counter and type registers*/ +#define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access } +#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access } +#define AMU_AMEVCNTR1_EL0(n) { SYS_DESC(SYS_AMEVCNTR1_EL0(n)), undef_access } +#define AMU_AMEVTYPER1_EL0(n) { SYS_DESC(SYS_AMEVTYPER1_EL0(n)), undef_access } + +static unsigned int ptrauth_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN; +} + +/* + * If we land here on a PtrAuth access, that is because we didn't + * fixup the access on exit by allowing the PtrAuth sysregs. The only + * way this happens is when the guest does not have PtrAuth support + * enabled. 
+ */ +#define __PTRAUTH_KEY(k) \ + { SYS_DESC(SYS_## k), undef_access, reset_unknown, k, \ + .visibility = ptrauth_visibility} + +#define PTRAUTH_KEY(k) \ + __PTRAUTH_KEY(k ## KEYLO_EL1), \ + __PTRAUTH_KEY(k ## KEYHI_EL1) + +static bool access_arch_timer(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + enum kvm_arch_timers tmr; + enum kvm_arch_timer_regs treg; + u64 reg = reg_to_encoding(r); + + switch (reg) { + case SYS_CNTP_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTV_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_AARCH32_CNTP_TVAL: + case SYS_CNTP_TVAL_EL02: + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTV_TVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHP_TVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHV_TVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTP_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTV_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_AARCH32_CNTP_CTL: + case SYS_CNTP_CTL_EL02: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTV_CTL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHP_CTL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHV_CTL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTP_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTV_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_AARCH32_CNTP_CVAL: + case SYS_CNTP_CVAL_EL02: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTV_CVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHP_CVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHV_CVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CNT; + break; + + case SYS_AARCH32_CNTPCT: + case SYS_AARCH32_CNTPCTSS: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CNT; break; - case 10: /* DCCSW */ - do_dc_csw(val); + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; break; + + case SYS_AARCH32_CNTVCT: + case SYS_AARCH32_CNTVCTSS: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + + default: + print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); + return undef_access(vcpu, p, r); } -done: - put_cpu(); + if (p->is_write) + kvm_arm_timer_write_sysreg(vcpu, tmr, treg, p->regval); + else + p->regval = kvm_arm_timer_read_sysreg(vcpu, tmr, treg); return true; } +static int arch_timer_set_user(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + switch 
(reg_to_encoding(rd)) { + case SYS_CNTV_CTL_EL0: + case SYS_CNTP_CTL_EL0: + case SYS_CNTHV_CTL_EL2: + case SYS_CNTHP_CTL_EL2: + val &= ~ARCH_TIMER_CTRL_IT_STAT; + break; + case SYS_CNTVCT_EL0: + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) + timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read() - val); + return 0; + case SYS_CNTPCT_EL0: + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) + timer_set_offset(vcpu_ptimer(vcpu), kvm_phys_timer_read() - val); + return 0; + } + + __vcpu_assign_sys_reg(vcpu, rd->reg, val); + return 0; +} + +static int arch_timer_get_user(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 *val) +{ + switch (reg_to_encoding(rd)) { + case SYS_CNTVCT_EL0: + *val = kvm_phys_timer_read() - timer_get_offset(vcpu_vtimer(vcpu)); + break; + case SYS_CNTPCT_EL0: + *val = kvm_phys_timer_read() - timer_get_offset(vcpu_ptimer(vcpu)); + break; + default: + *val = __vcpu_sys_reg(vcpu, rd->reg); + } + + return 0; +} + +static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, + s64 new, s64 cur) +{ + struct arm64_ftr_bits kvm_ftr = *ftrp; + + /* Some features have different safe value type in KVM than host features */ + switch (id) { + case SYS_ID_AA64DFR0_EL1: + switch (kvm_ftr.shift) { + case ID_AA64DFR0_EL1_PMUVer_SHIFT: + kvm_ftr.type = FTR_LOWER_SAFE; + break; + case ID_AA64DFR0_EL1_DebugVer_SHIFT: + kvm_ftr.type = FTR_LOWER_SAFE; + break; + } + break; + case SYS_ID_DFR0_EL1: + if (kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT) + kvm_ftr.type = FTR_LOWER_SAFE; + break; + } + + return arm64_ftr_safe_value(&kvm_ftr, new, cur); +} + /* - * We could trap ID_DFR0 and tell the guest we don't support performance - * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was - * NAKed, so it will read the PMCR anyway. + * arm64_check_features() - Check if a feature register value constitutes + * a subset of features indicated by the idreg's KVM sanitised limit. + * + * This function will check if each feature field of @val is the "safe" value + * against idreg's KVM sanitised limit return from reset() callback. + * If a field value in @val is the same as the one in limit, it is always + * considered the safe value regardless For register fields that are not in + * writable, only the value in limit is considered the safe value. * - * Therefore we tell the guest we have 0 counters. Unfortunately, we - * must always support PMCCNTR (the cycle counter): we just RAZ/WI for - * all PM registers, which doesn't crash the guest kernel at least. + * Return: 0 if all the fields are safe. Otherwise, return negative errno. */ -static bool pm_fake(struct kvm_vcpu *vcpu, - const struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int arm64_check_features(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + const struct arm64_ftr_reg *ftr_reg; + const struct arm64_ftr_bits *ftrp = NULL; + u32 id = reg_to_encoding(rd); + u64 writable_mask = rd->val; + u64 limit = rd->reset(vcpu, rd); + u64 mask = 0; + + /* + * Hidden and unallocated ID registers may not have a corresponding + * struct arm64_ftr_reg. Of course, if the register is RAZ we know the + * only safe value is 0. + */ + if (sysreg_visible_as_raz(vcpu, rd)) + return val ? 
-E2BIG : 0; + + ftr_reg = get_arm64_ftr_reg(id); + if (!ftr_reg) + return -EINVAL; + + ftrp = ftr_reg->ftr_bits; + + for (; ftrp && ftrp->width; ftrp++) { + s64 f_val, f_lim, safe_val; + u64 ftr_mask; + + ftr_mask = arm64_ftr_mask(ftrp); + if ((ftr_mask & writable_mask) != ftr_mask) + continue; + + f_val = arm64_ftr_value(ftrp, val); + f_lim = arm64_ftr_value(ftrp, limit); + mask |= ftr_mask; + + if (f_val == f_lim) + safe_val = f_val; + else + safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim); + + if (safe_val != f_val) + return -E2BIG; + } + + /* For fields that are not writable, values in limit are the safe values. */ + if ((val & ~mask) != (limit & ~mask)) + return -E2BIG; + + return 0; +} + +static u8 pmuver_to_perfmon(u8 pmuver) +{ + switch (pmuver) { + case ID_AA64DFR0_EL1_PMUVer_IMP: + return ID_DFR0_EL1_PerfMon_PMUv3; + case ID_AA64DFR0_EL1_PMUVer_IMP_DEF: + return ID_DFR0_EL1_PerfMon_IMPDEF; + default: + /* Anything ARMv8.1+ and NI have the same value. For now. */ + return pmuver; + } +} + +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); + +/* Read a sanitised cpufeature ID register by sys_reg_desc */ +static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + u32 id = reg_to_encoding(r); + u64 val; + + if (sysreg_visible_as_raz(vcpu, r)) + return 0; + + val = read_sanitised_ftr_reg(id); + + switch (id) { + case SYS_ID_AA64DFR0_EL1: + val = sanitise_id_aa64dfr0_el1(vcpu, val); + break; + case SYS_ID_AA64PFR0_EL1: + val = sanitise_id_aa64pfr0_el1(vcpu, val); + break; + case SYS_ID_AA64PFR1_EL1: + val = sanitise_id_aa64pfr1_el1(vcpu, val); + break; + case SYS_ID_AA64PFR2_EL1: + val &= ID_AA64PFR2_EL1_FPMR | + (kvm_has_mte(vcpu->kvm) ? 
+ ID_AA64PFR2_EL1_MTEFAR | ID_AA64PFR2_EL1_MTESTOREONLY : + 0); + break; + case SYS_ID_AA64ISAR1_EL1: + if (!vcpu_has_ptrauth(vcpu)) + val &= ~(ID_AA64ISAR1_EL1_APA | + ID_AA64ISAR1_EL1_API | + ID_AA64ISAR1_EL1_GPA | + ID_AA64ISAR1_EL1_GPI); + break; + case SYS_ID_AA64ISAR2_EL1: + if (!vcpu_has_ptrauth(vcpu)) + val &= ~(ID_AA64ISAR2_EL1_APA3 | + ID_AA64ISAR2_EL1_GPA3); + if (!cpus_have_final_cap(ARM64_HAS_WFXT) || + has_broken_cntvoff()) + val &= ~ID_AA64ISAR2_EL1_WFxT; + break; + case SYS_ID_AA64ISAR3_EL1: + val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE | + ID_AA64ISAR3_EL1_FAMINMAX; + break; + case SYS_ID_AA64MMFR2_EL1: + val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; + val &= ~ID_AA64MMFR2_EL1_NV; + break; + case SYS_ID_AA64MMFR3_EL1: + val &= ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_SCTLRX | + ID_AA64MMFR3_EL1_S1POE | + ID_AA64MMFR3_EL1_S1PIE; + break; + case SYS_ID_MMFR4_EL1: + val &= ~ID_MMFR4_EL1_CCIDX; + break; + } + + if (vcpu_has_nv(vcpu)) + val = limit_nv_id_reg(vcpu->kvm, id, val); + + return val; +} + +static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return __kvm_read_sanitised_id_reg(vcpu, r); +} + +static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r)); +} + +static bool is_feature_id_reg(u32 encoding) +{ + return (sys_reg_Op0(encoding) == 3 && + (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) && + sys_reg_CRn(encoding) == 0 && + sys_reg_CRm(encoding) <= 7); +} + +/* + * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is + * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID + * registers KVM maintains on a per-VM basis. + * + * Additionally, the implementation ID registers and CTR_EL0 are handled as + * per-VM registers. + */ +static inline bool is_vm_ftr_id_reg(u32 id) +{ + switch (id) { + case SYS_CTR_EL0: + case SYS_MIDR_EL1: + case SYS_REVIDR_EL1: + case SYS_AIDR_EL1: + return true; + default: + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && + sys_reg_CRm(id) < 8); + + } +} + +static inline bool is_vcpu_ftr_id_reg(u32 id) +{ + return is_feature_id_reg(id) && !is_vm_ftr_id_reg(id); +} + +static inline bool is_aa32_id_reg(u32 id) +{ + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && + sys_reg_CRm(id) <= 3); +} + +static unsigned int id_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + u32 id = reg_to_encoding(r); + + switch (id) { + case SYS_ID_AA64ZFR0_EL1: + if (!vcpu_has_sve(vcpu)) + return REG_RAZ; + break; + } + + return 0; +} + +static unsigned int aa32_id_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + /* + * AArch32 ID registers are UNKNOWN if AArch32 isn't implemented at any + * EL. Promote to RAZ/WI in order to guarantee consistency between + * systems. 
+ */ + if (!kvm_supports_32bit_el0()) + return REG_RAZ | REG_USER_WI; + + return id_visibility(vcpu, r); +} + +static unsigned int raz_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return REG_RAZ; +} + +/* cpufeature ID register access trap handlers */ + +static bool access_id_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) { if (p->is_write) - return ignore_write(vcpu, p); + return write_to_read_only(vcpu, p, r); + + p->regval = read_id_reg(vcpu, r); + + return true; +} + +/* Visibility overrides for SVE-specific control registers */ +static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (vcpu_has_sve(vcpu)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int sme_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_fpmr(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + if (!vcpu_has_sve(vcpu)) + val &= ~ID_AA64PFR0_EL1_SVE_MASK; + + /* + * The default is to expose CSV2 == 1 if the HW isn't affected. + * Although this is a per-CPU feature, we make it global because + * asymmetric systems are just a nuisance. + * + * Userspace can override this as long as it doesn't promise + * the impossible. + */ + if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { + val &= ~ID_AA64PFR0_EL1_CSV2_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP); + } + if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { + val &= ~ID_AA64PFR0_EL1_CSV3_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP); + } + + if (vgic_is_v3(vcpu->kvm)) { + val &= ~ID_AA64PFR0_EL1_GIC_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + } + + val &= ~ID_AA64PFR0_EL1_AMU_MASK; + + /* + * MPAM is disabled by default as KVM also needs a set of PARTID to + * program the MPAMVPMx_EL2 PARTID remapping registers with. But some + * older kernels let the guest see the ID bit. + */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + + return val; +} + +static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + if (!kvm_has_mte(vcpu->kvm)) { + val &= ~ID_AA64PFR1_EL1_MTE; + val &= ~ID_AA64PFR1_EL1_MTE_frac; + } + + if (!(cpus_have_final_cap(ARM64_HAS_RASV1P1_EXTN) && + SYS_FIELD_GET(ID_AA64PFR0_EL1, RAS, pfr0) == ID_AA64PFR0_EL1_RAS_IMP)) + val &= ~ID_AA64PFR1_EL1_RAS_frac; + + val &= ~ID_AA64PFR1_EL1_SME; + val &= ~ID_AA64PFR1_EL1_RNDR_trap; + val &= ~ID_AA64PFR1_EL1_NMI; + val &= ~ID_AA64PFR1_EL1_GCS; + val &= ~ID_AA64PFR1_EL1_THE; + val &= ~ID_AA64PFR1_EL1_MTEX; + val &= ~ID_AA64PFR1_EL1_PFAR; + val &= ~ID_AA64PFR1_EL1_MPAM_frac; + + return val; +} + +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); + + /* + * Only initialize the PMU version if the vCPU was configured with one. 
+ */ + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + if (kvm_vcpu_has_pmu(vcpu)) + val |= SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer, + kvm_arm_pmu_get_pmuver_limit()); + + /* Hide SPE from guests */ + val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; + + /* Hide BRBE from guests */ + val &= ~ID_AA64DFR0_EL1_BRBE_MASK; + + return val; +} + +/* + * Older versions of KVM erroneously claim support for FEAT_DoubleLock with + * NV-enabled VMs on unsupporting hardware. Silently ignore the incorrect + * value if it is consistent with the bug. + */ +static bool ignore_feat_doublelock(struct kvm_vcpu *vcpu, u64 val) +{ + u8 host, user; + + if (!vcpu_has_nv(vcpu)) + return false; + + host = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock, + read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); + user = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock, val); + + return host == ID_AA64DFR0_EL1_DoubleLock_NI && + user == ID_AA64DFR0_EL1_DoubleLock_IMP; +} + +static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 debugver = SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, val); + u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); + + /* + * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the + * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously + * exposed an IMP_DEF PMU to userspace and the guest on systems w/ + * non-architectural PMUs. Of course, PMUv3 is the only game in town for + * PMU virtualization, so the IMP_DEF value was rather user-hostile. + * + * At minimum, we're on the hook to allow values that were given to + * userspace by KVM. Cover our tracks here and replace the IMP_DEF value + * with a more sensible NI. The value of an ID register changing under + * the nose of the guest is unfortunate, but is certainly no more + * surprising than an ill-guided PMU driver poking at impdef system + * registers that end in an UNDEF... + */ + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + + /* + * ID_AA64DFR0_EL1.DebugVer is one of those awkward fields with a + * nonzero minimum safe value. + */ + if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP) + return -EINVAL; + + if (ignore_feat_doublelock(vcpu, val)) { + val &= ~ID_AA64DFR0_EL1_DoubleLock; + val |= SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI); + } + + return set_id_reg(vcpu, rd, val); +} + +static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u8 perfmon; + u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1); + + val &= ~ID_DFR0_EL1_PerfMon_MASK; + if (kvm_vcpu_has_pmu(vcpu)) { + perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon); + } + + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8); + + return val; +} + +static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val); + u8 copdbg = SYS_FIELD_GET(ID_DFR0_EL1, CopDbg, val); + + if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) { + val &= ~ID_DFR0_EL1_PerfMon_MASK; + perfmon = 0; + } + + /* + * Allow DFR0_EL1.PerfMon to be set from userspace as long as + * it doesn't promise more than what the HW gives us on the + * AArch64 side (as everything is emulated with that), and + * that this is a PMUv3. 
+ */ + if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3) + return -EINVAL; + + if (copdbg < ID_DFR0_EL1_CopDbg_Armv8) + return -EINVAL; + + return set_id_reg(vcpu, rd, val); +} + +static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK; + + /* + * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits + * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to + * guests, but didn't add trap handling. KVM doesn't support MPAM and + * always returns an UNDEF for these registers. The guest must see 0 + * for this field. + * + * But KVM must also accept values from user-space that were provided + * by KVM. On CPUs that support MPAM, permit user-space to write + * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field. + */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + + /* Fail the guest's request to disable the AA64 ISA at EL{0,1,2} */ + if (!FIELD_GET(ID_AA64PFR0_EL1_EL0, user_val) || + !FIELD_GET(ID_AA64PFR0_EL1_EL1, user_val) || + (vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val))) + return -EINVAL; + + /* + * If we are running on a GICv5 host and support FEAT_GCIE_LEGACY, then + * we support GICv3. Fail attempts to do anything but set that to IMP. + */ + if (vgic_is_v3_compat(vcpu->kvm) && + FIELD_GET(ID_AA64PFR0_EL1_GIC_MASK, user_val) != ID_AA64PFR0_EL1_GIC_IMP) + return -EINVAL; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK; + u8 mte = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE, hw_val); + u8 user_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, user_val); + u8 hw_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, hw_val); + + /* See set_id_aa64pfr0_el1 for comment about MPAM */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + + /* + * Previously MTE_frac was hidden from guest. However, if the + * hardware supports MTE2 but not MTE_ASYM_FAULT then a value + * of 0 for this field indicates that the hardware supports + * MTE_ASYNC. Whereas, 0xf indicates MTE_ASYNC is not supported. + * + * As KVM must accept values from KVM provided by user-space, + * when ID_AA64PFR1_EL1.MTE is 2 allow user-space to set + * ID_AA64PFR1_EL1.MTE_frac to 0. However, ignore it to avoid + * incorrectly claiming hardware support for MTE_ASYNC in the + * guest. + */ + + if (mte == ID_AA64PFR1_EL1_MTE_MTE2 && + hw_mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI && + user_mte_frac == ID_AA64PFR1_EL1_MTE_frac_ASYNC) { + user_val &= ~ID_AA64PFR1_EL1_MTE_frac_MASK; + user_val |= hw_val & ID_AA64PFR1_EL1_MTE_frac_MASK; + } + + return set_id_reg(vcpu, rd, user_val); +} + +/* + * Allow userspace to de-feature a stage-2 translation granule but prevent it + * from claiming the impossible. 
+ */ +#define tgran2_val_allowed(tg, safe, user) \ +({ \ + u8 __s = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, safe); \ + u8 __u = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, user); \ + \ + __s == __u || __u == ID_AA64MMFR0_EL1_##tg##_NI; \ +}) + +static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd); + + if (!vcpu_has_nv(vcpu)) + return set_id_reg(vcpu, rd, user_val); + + if (!tgran2_val_allowed(TGRAN4_2, sanitized_val, user_val) || + !tgran2_val_allowed(TGRAN16_2, sanitized_val, user_val) || + !tgran2_val_allowed(TGRAN64_2, sanitized_val, user_val)) + return -EINVAL; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK; + + /* + * We made the mistake to expose the now deprecated NV field, + * so allow userspace to write it, but silently ignore it. + */ + if ((hw_val & nv_mask) == (user_val & nv_mask)) + user_val &= ~nv_mask; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_ctr_el0(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u8 user_L1Ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val); + + /* + * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved. + * Hence only allow to set VIPT(0b10) or PIPT(0b11) for L1Ip based + * on what hardware reports. + * + * Using a VIPT software model on PIPT will lead to over invalidation, + * but still correct. Hence, we can allow downgrading PIPT to VIPT, + * but not the other way around. This is handled via arm64_ftr_safe_value() + * as CTR_EL0 ftr_bits has L1Ip field with type FTR_EXACT and safe value + * set as VIPT. + */ + switch (user_L1Ip) { + case CTR_EL0_L1Ip_RESERVED_VPIPT: + case CTR_EL0_L1Ip_RESERVED_AIVIVT: + return -EINVAL; + case CTR_EL0_L1Ip_VIPT: + case CTR_EL0_L1Ip_PIPT: + return set_id_reg(vcpu, rd, user_val); + default: + return -ENOENT; + } +} + +/* + * cpufeature ID register user accessors + * + * For now, these registers are immutable for userspace, so no values + * are stored, and for set_id_reg() we don't allow the effective value + * to be changed. + */ +static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val) +{ + /* + * Avoid locking if the VM has already started, as the ID registers are + * guaranteed to be invariant at that point. + */ + if (kvm_vm_has_ran_once(vcpu->kvm)) { + *val = read_id_reg(vcpu, rd); + return 0; + } + + mutex_lock(&vcpu->kvm->arch.config_lock); + *val = read_id_reg(vcpu, rd); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return 0; +} + +static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) +{ + u32 id = reg_to_encoding(rd); + int ret; + + mutex_lock(&vcpu->kvm->arch.config_lock); + + /* + * Once the VM has started the ID registers are immutable. Reject any + * write that does not match the final register value. + */ + if (kvm_vm_has_ran_once(vcpu->kvm)) { + if (val != read_id_reg(vcpu, rd)) + ret = -EBUSY; + else + ret = 0; + + mutex_unlock(&vcpu->kvm->arch.config_lock); + return ret; + } + + ret = arm64_check_features(vcpu, rd, val); + if (!ret) + kvm_set_vm_id_reg(vcpu->kvm, id, val); + + mutex_unlock(&vcpu->kvm->arch.config_lock); + + /* + * arm64_check_features() returns -E2BIG to indicate the register's + * feature set is a superset of the maximally-allowed register value. 
+ * While it would be nice to precisely describe this to userspace, the + * existing UAPI for KVM_SET_ONE_REG has it that invalid register + * writes return -EINVAL. + */ + if (ret == -E2BIG) + ret = -EINVAL; + return ret; +} + +void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val) +{ + u64 *p = __vm_id_reg(&kvm->arch, reg); + + lockdep_assert_held(&kvm->arch.config_lock); + + if (KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !p, kvm)) + return; + + *p = val; +} + +static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val) +{ + *val = 0; + return 0; +} + +static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) +{ + return 0; +} + +static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0); + return true; +} + +static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = __vcpu_sys_reg(vcpu, r->reg); + return true; +} + +/* + * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary + * by the physical CPU which the vcpu currently resides in. + */ +static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); + u64 clidr; + u8 loc; + + if ((ctr_el0 & CTR_EL0_IDC)) { + /* + * Data cache clean to the PoU is not required so LoUU and LoUIS + * will not be set and a unified cache, which will be marked as + * LoC, will be added. + * + * If not DIC, let the unified cache L2 so that an instruction + * cache can be added as L1 later. + */ + loc = (ctr_el0 & CTR_EL0_DIC) ? 1 : 2; + clidr = CACHE_TYPE_UNIFIED << CLIDR_CTYPE_SHIFT(loc); + } else { + /* + * Data cache clean to the PoU is required so let L1 have a data + * cache and mark it as LoUU and LoUIS. As L1 has a data cache, + * it can be marked as LoC too. + */ + loc = 1; + clidr = 1 << CLIDR_LOUU_SHIFT; + clidr |= 1 << CLIDR_LOUIS_SHIFT; + clidr |= CACHE_TYPE_DATA << CLIDR_CTYPE_SHIFT(1); + } + + /* + * Instruction cache invalidation to the PoU is required so let L1 have + * an instruction cache. If L1 already has a data cache, it will be + * CACHE_TYPE_SEPARATE. + */ + if (!(ctr_el0 & CTR_EL0_DIC)) + clidr |= CACHE_TYPE_INST << CLIDR_CTYPE_SHIFT(1); + + clidr |= loc << CLIDR_LOC_SHIFT; + + /* + * Add tag cache unified to data cache. Allocation tags and data are + * unified in a cache line so that it looks valid even if there is only + * one cache line. 
+ */ + if (kvm_has_mte(vcpu->kvm)) + clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc); + + __vcpu_assign_sys_reg(vcpu, r->reg, clidr); + + return __vcpu_sys_reg(vcpu, r->reg); +} + +static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) +{ + u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); + u64 idc = !CLIDR_LOC(val) || (!CLIDR_LOUIS(val) && !CLIDR_LOUU(val)); + + if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc)) + return -EINVAL; + + __vcpu_assign_sys_reg(vcpu, rd->reg, val); + + return 0; +} + +static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + int reg = r->reg; + + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, reg); else - return read_zero(vcpu, p); + p->regval = vcpu_read_sys_reg(vcpu, reg); + return true; +} + +static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 csselr; + + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); + csselr &= CSSELR_EL1_Level | CSSELR_EL1_InD; + if (csselr < CSSELR_MAX) + p->regval = get_ccsidr(vcpu, csselr); + + return true; +} + +static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_mte(vcpu->kvm)) + return 0; + + return REG_HIDDEN; } -static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +#define MTE_REG(name) { \ + SYS_DESC(SYS_##name), \ + .access = undef_access, \ + .reset = reset_unknown, \ + .reg = name, \ + .visibility = mte_visibility, \ +} + +static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) { - u64 amair; + if (vcpu_has_nv(vcpu)) + return 0; + + return REG_HIDDEN; +} - asm volatile("mrs %0, amair_el1\n" : "=r" (amair)); - vcpu_sys_reg(vcpu, AMAIR_EL1) = amair; +static bool bad_vncr_trap(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* + * We really shouldn't be here, and this is likely the result + * of a misconfigured trap, as this register should target the + * VNCR page, and nothing else. + */ + return bad_trap(vcpu, p, r, + "trap of VNCR-backed register"); } -static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static bool bad_redir_trap(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) { /* - * Simply map the vcpu_id into the Aff0 field of the MPIDR. + * We really shouldn't be here, and this is likely the result + * of a misconfigured trap, as this register should target the + * corresponding EL1, and nothing else. 
*/ - vcpu_sys_reg(vcpu, MPIDR_EL1) = (1UL << 31) | (vcpu->vcpu_id & 0xff); + return bad_trap(vcpu, p, r, + "trap of EL2 register redirected to EL1"); +} + +#define SYS_REG_USER_FILTER(name, acc, rst, v, gu, su, filter) { \ + SYS_DESC(SYS_##name), \ + .access = acc, \ + .reset = rst, \ + .reg = name, \ + .get_user = gu, \ + .set_user = su, \ + .visibility = filter, \ + .val = v, \ +} + +#define EL2_REG_FILTERED(name, acc, rst, v, filter) \ + SYS_REG_USER_FILTER(name, acc, rst, v, NULL, NULL, filter) + +#define EL2_REG(name, acc, rst, v) \ + EL2_REG_FILTERED(name, acc, rst, v, el2_visibility) + +#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) +#define EL2_REG_VNCR_FILT(name, vis) \ + EL2_REG_FILTERED(name, bad_vncr_trap, reset_val, 0, vis) +#define EL2_REG_VNCR_GICv3(name) \ + EL2_REG_VNCR_FILT(name, hidden_visibility) +#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) + +#define TIMER_REG(name, vis) \ + SYS_REG_USER_FILTER(name, access_arch_timer, reset_val, 0, \ + arch_timer_get_user, arch_timer_set_user, vis) + +/* + * Since reset() callback and field val are not used for idregs, they will be + * used for specific purposes for idregs. + * The reset() would return KVM sanitised register value. The value would be the + * same as the host kernel sanitised value if there is no KVM sanitisation. + * The val would be used as a mask indicating writable fields for the idreg. + * Only bits with 1 are writable from userspace. This mask might not be + * necessary in the future whenever all ID registers are enabled as writable + * from userspace. + */ + +#define ID_DESC_DEFAULT_CALLBACKS \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = id_visibility, \ + .reset = kvm_read_sanitised_id_reg + +#define ID_DESC(name) \ + SYS_DESC(SYS_##name), \ + ID_DESC_DEFAULT_CALLBACKS + +/* sys_reg_desc initialiser for known cpufeature ID registers */ +#define ID_SANITISED(name) { \ + ID_DESC(name), \ + .val = 0, \ +} + +/* sys_reg_desc initialiser for writable ID registers */ +#define ID_WRITABLE(name, mask) { \ + ID_DESC(name), \ + .val = mask, \ +} + +/* + * 32bit ID regs are fully writable when the guest is 32bit + * capable. Nothing in the KVM code should rely on 32bit features + * anyway, only 64bit, so let the VMM do its worse. + */ +#define AA32_ID_WRITABLE(name) { \ + ID_DESC(name), \ + .visibility = aa32_id_visibility, \ + .val = GENMASK(31, 0), \ +} + +/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ +#define ID_FILTERED(sysreg, name, mask) { \ + ID_DESC(sysreg), \ + .set_user = set_##name, \ + .val = (mask), \ +} + +/* + * sys_reg_desc initialiser for architecturally unallocated cpufeature ID + * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 + * (1 <= crm < 8, 0 <= Op2 < 8). + */ +#define ID_UNALLOCATED(crm, op2) { \ + .name = "S3_0_0_" #crm "_" #op2, \ + Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ + ID_DESC_DEFAULT_CALLBACKS, \ + .visibility = raz_visibility, \ + .val = 0, \ +} + +/* + * sys_reg_desc initialiser for known ID registers that we hide from guests. + * For now, these are exposed just like unallocated ID regs: they appear + * RAZ for the guest. 
+ */ +#define ID_HIDDEN(name) { \ + ID_DESC(name), \ + .visibility = raz_visibility, \ + .val = 0, \ +} + +static bool access_sp_el1(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_assign_sys_reg(vcpu, SP_EL1, p->regval); + else + p->regval = __vcpu_sys_reg(vcpu, SP_EL1); + + return true; +} + +static bool access_elr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, ELR_EL1); + else + p->regval = vcpu_read_sys_reg(vcpu, ELR_EL1); + + return true; +} + +static bool access_spsr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_assign_sys_reg(vcpu, SPSR_EL1, p->regval); + else + p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1); + + return true; +} + +static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_assign_sys_reg(vcpu, CNTKCTL_EL1, p->regval); + else + p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); + + return true; +} + +static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 val = r->val; + + if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) + val |= HCR_E2H; + + __vcpu_assign_sys_reg(vcpu, r->reg, val); + + return __vcpu_sys_reg(vcpu, r->reg); +} + +static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + unsigned int (*fn)(const struct kvm_vcpu *, + const struct sys_reg_desc *)) +{ + return el2_visibility(vcpu, rd) ?: fn(vcpu, rd); +} + +static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, sve_visibility); +} + +static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (el2_visibility(vcpu, rd) == 0 && + kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int sctlr2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_sctlr2(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int sctlr2_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, sctlr2_visibility); +} + +static bool access_zcr_el2(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + unsigned int vq; + + if (guest_hyp_sve_traps_enabled(vcpu)) { + kvm_inject_nested_sve_trap(vcpu); + return false; + } + + if (!p->is_write) { + p->regval = __vcpu_sys_reg(vcpu, ZCR_EL2); + return true; + } + + vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; + vq = min(vq, vcpu_sve_max_vq(vcpu)); + __vcpu_assign_sys_reg(vcpu, ZCR_EL2, vq - 1); + return true; +} + +static bool access_gic_vtr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = kvm_get_guest_vtr_el2(); + + return true; +} + +static bool access_gic_misr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_misr(vcpu); + + return true; +} + +static bool access_gic_eisr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = 
vgic_v3_get_eisr(vcpu); + + return true; +} + +static bool access_gic_elrsr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_elrsr(vcpu); + + return true; +} + +static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_s1poe(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1poe_visibility); +} + +static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_tcr2(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, tcr2_visibility); +} + +static unsigned int fgt2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (el2_visibility(vcpu, rd) == 0 && + kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, FGT2)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int fgt_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (el2_visibility(vcpu, rd) == 0 && + kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_s1pie(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1pie_visibility); +} + +static unsigned int cnthv_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (vcpu_has_nv(vcpu) && + !vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2_E2H0)) + return 0; + + return REG_HIDDEN; +} + +static bool access_mdcr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 hpmn, val, old = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!p->is_write) { + p->regval = old; + return true; + } + + val = p->regval; + hpmn = FIELD_GET(MDCR_EL2_HPMN, val); + + /* + * If HPMN is out of bounds, limit it to what we actually + * support. This matches the UNKNOWN definition of the field + * in that case, and keeps the emulation simple. Sort of. + */ + if (hpmn > vcpu->kvm->arch.nr_pmu_counters) { + hpmn = vcpu->kvm->arch.nr_pmu_counters; + u64p_replace_bits(&val, hpmn, MDCR_EL2_HPMN); + } + + __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); + + /* + * Request a reload of the PMU to enable/disable the counters + * affected by HPME. 
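To make the HPMN clamping in access_mdcr() above concrete, here is a stand-alone sketch of the same idea: pull the field out of the register value, cap it at the number of counters the VM actually has, and write it back. The mask below assumes MDCR_EL2.HPMN occupies bits [4:0]; the helper name is invented for the example.

#include <stdint.h>
#include <stdio.h>

#define HPMN_MASK	0x1fULL		/* MDCR_EL2.HPMN, bits [4:0] */

static uint64_t clamp_hpmn(uint64_t mdcr, uint64_t nr_counters)
{
	uint64_t hpmn = mdcr & HPMN_MASK;

	if (hpmn > nr_counters)
		hpmn = nr_counters;	/* out-of-range HPMN is UNKNOWN; pick the limit */

	return (mdcr & ~HPMN_MASK) | hpmn;
}

int main(void)
{
	/* A write asking for 30 counters on a VM with 6 is clamped to 6. */
	printf("0x%llx\n", (unsigned long long)clamp_hpmn(0x1e, 6));
	return 0;
}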
+ */ + if ((old ^ val) & MDCR_EL2_HPME) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + return true; +} + +static bool access_ras(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + struct kvm *kvm = vcpu->kvm; + + switch(reg_to_encoding(r)) { + case SYS_ERXPFGCDN_EL1: + case SYS_ERXPFGCTL_EL1: + case SYS_ERXPFGF_EL1: + case SYS_ERXMISC2_EL1: + case SYS_ERXMISC3_EL1: + if (!(kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) || + (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) && + kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1)))) { + kvm_inject_undefined(vcpu); + return false; + } + break; + default: + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + } + + return trap_raz_wi(vcpu, p, r); +} + +/* + * For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and + * AIDR_EL1 as "invariant" registers, meaning userspace cannot change them. + * The values made visible to userspace were the register values of the boot + * CPU. + * + * At the same time, reads from these registers at EL1 previously were not + * trapped, allowing the guest to read the actual hardware value. On big-little + * machines, this means the VM can see different values depending on where a + * given vCPU got scheduled. + * + * These registers are now trapped as collateral damage from SME, and what + * follows attempts to give a user / guest view consistent with the existing + * ABI. + */ +static bool access_imp_id_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + /* + * Return the VM-scoped implementation ID register values if userspace + * has made them writable. + */ + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &vcpu->kvm->arch.flags)) + return access_id_reg(vcpu, p, r); + + /* + * Otherwise, fall back to the old behavior of returning the value of + * the current CPU. + */ + switch (reg_to_encoding(r)) { + case SYS_REVIDR_EL1: + p->regval = read_sysreg(revidr_el1); + break; + case SYS_AIDR_EL1: + p->regval = read_sysreg(aidr_el1); + break; + default: + WARN_ON_ONCE(1); + } + + return true; +} + +static u64 __ro_after_init boot_cpu_midr_val; +static u64 __ro_after_init boot_cpu_revidr_val; +static u64 __ro_after_init boot_cpu_aidr_val; + +static void init_imp_id_regs(void) +{ + boot_cpu_midr_val = read_sysreg(midr_el1); + boot_cpu_revidr_val = read_sysreg(revidr_el1); + boot_cpu_aidr_val = read_sysreg(aidr_el1); +} + +static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + switch (reg_to_encoding(r)) { + case SYS_MIDR_EL1: + return boot_cpu_midr_val; + case SYS_REVIDR_EL1: + return boot_cpu_revidr_val; + case SYS_AIDR_EL1: + return boot_cpu_aidr_val; + default: + KVM_BUG_ON(1, vcpu->kvm); + return 0; + } +} + +static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct kvm *kvm = vcpu->kvm; + u64 expected; + + guard(mutex)(&kvm->arch.config_lock); + + expected = read_id_reg(vcpu, r); + if (expected == val) + return 0; + + if (!test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags)) + return -EINVAL; + + /* + * Once the VM has started the ID registers are immutable. Reject the + * write if userspace tries to change it. + */ + if (kvm_vm_has_ran_once(kvm)) + return -EBUSY; + + /* + * Any value is allowed for the implementation ID registers so long as + * it is within the writable mask. 
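The subset test that follows, (val & r->val) != val, simply rejects any value with a bit set outside the writable mask. A minimal stand-alone reading of it, with an example chosen purely for illustration:

#include <stdbool.h>
#include <stdint.h>

static bool within_writable_mask(uint64_t val, uint64_t mask)
{
	return (val & mask) == val;	/* no bits set outside the mask */
}

/*
 * With MIDR_EL1's mask of GENMASK_ULL(31, 0), any 32-bit value passes,
 * while a value with any of bits [63:32] set is refused.
 */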
+ */ + if ((val & r->val) != val) + return -EINVAL; + + kvm_set_vm_id_reg(kvm, reg_to_encoding(r), val); + return 0; +} + +#define IMPLEMENTATION_ID(reg, mask) { \ + SYS_DESC(SYS_##reg), \ + .access = access_imp_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_imp_id_reg, \ + .reset = reset_imp_id_reg, \ + .val = mask, \ + } + +static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + __vcpu_assign_sys_reg(vcpu, r->reg, vcpu->kvm->arch.nr_pmu_counters); + return vcpu->kvm->arch.nr_pmu_counters; } /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 + * + * Debug handling: We do trap most, if not all debug related system + * registers. The implementation is good enough to ensure that a guest + * can use these with minimal performance degradation. The drawback is + * that we don't implement any of the external debug architecture. + * This should be revisited if we ever encounter a more demanding + * guest... */ static const struct sys_reg_desc sys_reg_descs[] = { - /* DC ISW */ - { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b0110), Op2(0b010), - access_dcsw }, - /* DC CSW */ - { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1010), Op2(0b010), - access_dcsw }, - /* DC CISW */ - { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1110), Op2(0b010), - access_dcsw }, - - /* TEECR32_EL1 */ - { Op0(0b10), Op1(0b010), CRn(0b0000), CRm(0b0000), Op2(0b000), - NULL, reset_val, TEECR32_EL1, 0 }, - /* TEEHBR32_EL1 */ - { Op0(0b10), Op1(0b010), CRn(0b0001), CRm(0b0000), Op2(0b000), - NULL, reset_val, TEEHBR32_EL1, 0 }, - /* DBGVCR32_EL2 */ - { Op0(0b10), Op1(0b100), CRn(0b0000), CRm(0b0111), Op2(0b000), - NULL, reset_val, DBGVCR32_EL2, 0 }, - - /* MPIDR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b101), - NULL, reset_mpidr, MPIDR_EL1 }, - /* SCTLR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000), - NULL, reset_val, SCTLR_EL1, 0x00C50078 }, - /* CPACR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010), - NULL, reset_val, CPACR_EL1, 0 }, - /* TTBR0_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, TTBR0_EL1 }, - /* TTBR1_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b001), - NULL, reset_unknown, TTBR1_EL1 }, - /* TCR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b010), - NULL, reset_val, TCR_EL1, 0 }, - - /* AFSR0_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b000), - NULL, reset_unknown, AFSR0_EL1 }, - /* AFSR1_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b001), - NULL, reset_unknown, AFSR1_EL1 }, - /* ESR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0010), Op2(0b000), - NULL, reset_unknown, ESR_EL1 }, - /* FAR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, FAR_EL1 }, - - /* PMINTENSET_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b001), - pm_fake }, - /* PMINTENCLR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b010), - pm_fake }, - - /* MAIR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000), - NULL, reset_unknown, MAIR_EL1 }, - /* AMAIR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0011), Op2(0b000), - NULL, reset_amair_el1, AMAIR_EL1 }, - - /* VBAR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000), - NULL, reset_val, VBAR_EL1, 0 }, - /* CONTEXTIDR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001), - 
NULL, reset_val, CONTEXTIDR_EL1, 0 }, - /* TPIDR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b100), - NULL, reset_unknown, TPIDR_EL1 }, - - /* CNTKCTL_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1110), CRm(0b0001), Op2(0b000), - NULL, reset_val, CNTKCTL_EL1, 0}, - - /* CSSELR_EL1 */ - { Op0(0b11), Op1(0b010), CRn(0b0000), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, CSSELR_EL1 }, - - /* PMCR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b000), - pm_fake }, - /* PMCNTENSET_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b001), - pm_fake }, - /* PMCNTENCLR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b010), - pm_fake }, - /* PMOVSCLR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b011), - pm_fake }, - /* PMSWINC_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b100), - pm_fake }, - /* PMSELR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b101), - pm_fake }, - /* PMCEID0_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b110), - pm_fake }, - /* PMCEID1_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b111), - pm_fake }, - /* PMCCNTR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000), - pm_fake }, - /* PMXEVTYPER_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b001), - pm_fake }, - /* PMXEVCNTR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b010), - pm_fake }, - /* PMUSERENR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b000), - pm_fake }, - /* PMOVSSET_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b011), - pm_fake }, - - /* TPIDR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b010), - NULL, reset_unknown, TPIDR_EL0 }, - /* TPIDRRO_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011), - NULL, reset_unknown, TPIDRRO_EL0 }, - - /* DACR32_EL2 */ - { Op0(0b11), Op1(0b100), CRn(0b0011), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, DACR32_EL2 }, - /* IFSR32_EL2 */ - { Op0(0b11), Op1(0b100), CRn(0b0101), CRm(0b0000), Op2(0b001), - NULL, reset_unknown, IFSR32_EL2 }, - /* FPEXC32_EL2 */ - { Op0(0b11), Op1(0b100), CRn(0b0101), CRm(0b0011), Op2(0b000), - NULL, reset_val, FPEXC32_EL2, 0x70 }, + DBG_BCR_BVR_WCR_WVR_EL1(0), + DBG_BCR_BVR_WCR_WVR_EL1(1), + { SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 }, + { SYS_DESC(SYS_MDSCR_EL1), trap_debug_regs, reset_val, MDSCR_EL1, 0 }, + DBG_BCR_BVR_WCR_WVR_EL1(2), + DBG_BCR_BVR_WCR_WVR_EL1(3), + DBG_BCR_BVR_WCR_WVR_EL1(4), + DBG_BCR_BVR_WCR_WVR_EL1(5), + DBG_BCR_BVR_WCR_WVR_EL1(6), + DBG_BCR_BVR_WCR_WVR_EL1(7), + DBG_BCR_BVR_WCR_WVR_EL1(8), + DBG_BCR_BVR_WCR_WVR_EL1(9), + DBG_BCR_BVR_WCR_WVR_EL1(10), + DBG_BCR_BVR_WCR_WVR_EL1(11), + DBG_BCR_BVR_WCR_WVR_EL1(12), + DBG_BCR_BVR_WCR_WVR_EL1(13), + DBG_BCR_BVR_WCR_WVR_EL1(14), + DBG_BCR_BVR_WCR_WVR_EL1(15), + + { SYS_DESC(SYS_MDRAR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_OSLAR_EL1), trap_oslar_el1 }, + { SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1, reset_val, OSLSR_EL1, + OSLSR_EL1_OSLM_IMPLEMENTED, .set_user = set_oslsr_el1, }, + { SYS_DESC(SYS_OSDLR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGPRCR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGCLAIMSET_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGCLAIMCLR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGAUTHSTATUS_EL1), trap_dbgauthstatus_el1 }, + + { SYS_DESC(SYS_MDCCSR_EL0), trap_raz_wi }, + { SYS_DESC(SYS_DBGDTR_EL0), trap_raz_wi }, + // DBGDTR[TR]X_EL0 share the same encoding + { 
SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, + + { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, + + IMPLEMENTATION_ID(MIDR_EL1, GENMASK_ULL(31, 0)), + { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, + IMPLEMENTATION_ID(REVIDR_EL1, GENMASK_ULL(63, 0)), + + /* + * ID regs: all ID_SANITISED() entries here must have corresponding + * entries in arm64_ftr_regs[]. + */ + + /* AArch64 mappings of the AArch32 ID registers */ + /* CRm=1 */ + AA32_ID_WRITABLE(ID_PFR0_EL1), + AA32_ID_WRITABLE(ID_PFR1_EL1), + { SYS_DESC(SYS_ID_DFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_dfr0_el1, + .visibility = aa32_id_visibility, + .reset = read_sanitised_id_dfr0_el1, + .val = GENMASK(31, 0) }, + ID_HIDDEN(ID_AFR0_EL1), + AA32_ID_WRITABLE(ID_MMFR0_EL1), + AA32_ID_WRITABLE(ID_MMFR1_EL1), + AA32_ID_WRITABLE(ID_MMFR2_EL1), + AA32_ID_WRITABLE(ID_MMFR3_EL1), + + /* CRm=2 */ + AA32_ID_WRITABLE(ID_ISAR0_EL1), + AA32_ID_WRITABLE(ID_ISAR1_EL1), + AA32_ID_WRITABLE(ID_ISAR2_EL1), + AA32_ID_WRITABLE(ID_ISAR3_EL1), + AA32_ID_WRITABLE(ID_ISAR4_EL1), + AA32_ID_WRITABLE(ID_ISAR5_EL1), + AA32_ID_WRITABLE(ID_MMFR4_EL1), + AA32_ID_WRITABLE(ID_ISAR6_EL1), + + /* CRm=3 */ + AA32_ID_WRITABLE(MVFR0_EL1), + AA32_ID_WRITABLE(MVFR1_EL1), + AA32_ID_WRITABLE(MVFR2_EL1), + ID_UNALLOCATED(3,3), + AA32_ID_WRITABLE(ID_PFR2_EL1), + ID_HIDDEN(ID_DFR1_EL1), + AA32_ID_WRITABLE(ID_MMFR5_EL1), + ID_UNALLOCATED(3,7), + + /* AArch64 ID registers */ + /* CRm=4 */ + ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1, + ~(ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SVE | + ID_AA64PFR0_EL1_AdvSIMD | + ID_AA64PFR0_EL1_FP)), + ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, + ~(ID_AA64PFR1_EL1_PFAR | + ID_AA64PFR1_EL1_MTEX | + ID_AA64PFR1_EL1_THE | + ID_AA64PFR1_EL1_GCS | + ID_AA64PFR1_EL1_MTE_frac | + ID_AA64PFR1_EL1_NMI | + ID_AA64PFR1_EL1_RNDR_trap | + ID_AA64PFR1_EL1_SME | + ID_AA64PFR1_EL1_RES0 | + ID_AA64PFR1_EL1_MPAM_frac | + ID_AA64PFR1_EL1_MTE)), + ID_WRITABLE(ID_AA64PFR2_EL1, + ID_AA64PFR2_EL1_FPMR | + ID_AA64PFR2_EL1_MTEFAR | + ID_AA64PFR2_EL1_MTESTOREONLY), + ID_UNALLOCATED(4,3), + ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), + ID_HIDDEN(ID_AA64SMFR0_EL1), + ID_UNALLOCATED(4,6), + ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), + + /* CRm=5 */ + /* + * Prior to FEAT_Debugv8.9, the architecture defines context-aware + * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). + * KVM does not trap + emulate the breakpoint registers, and as such + * cannot support a layout that misaligns with the underlying hardware. + * While it may be possible to describe a subset that aligns with + * hardware, just prevent changes to BRPs and CTX_CMPs altogether for + * simplicity. + * + * See DDI0487K.a, section D2.8.3 Breakpoint types and linking + * of breakpoints for more details. 
+ */ + ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1, + ID_AA64DFR0_EL1_DoubleLock_MASK | + ID_AA64DFR0_EL1_WRPs_MASK | + ID_AA64DFR0_EL1_PMUVer_MASK | + ID_AA64DFR0_EL1_DebugVer_MASK), + ID_SANITISED(ID_AA64DFR1_EL1), + ID_UNALLOCATED(5,2), + ID_UNALLOCATED(5,3), + ID_HIDDEN(ID_AA64AFR0_EL1), + ID_HIDDEN(ID_AA64AFR1_EL1), + ID_UNALLOCATED(5,6), + ID_UNALLOCATED(5,7), + + /* CRm=6 */ + ID_WRITABLE(ID_AA64ISAR0_EL1, ~ID_AA64ISAR0_EL1_RES0), + ID_WRITABLE(ID_AA64ISAR1_EL1, ~(ID_AA64ISAR1_EL1_GPI | + ID_AA64ISAR1_EL1_GPA | + ID_AA64ISAR1_EL1_API | + ID_AA64ISAR1_EL1_APA)), + ID_WRITABLE(ID_AA64ISAR2_EL1, ~(ID_AA64ISAR2_EL1_RES0 | + ID_AA64ISAR2_EL1_APA3 | + ID_AA64ISAR2_EL1_GPA3)), + ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT | + ID_AA64ISAR3_EL1_LSFE | + ID_AA64ISAR3_EL1_FAMINMAX)), + ID_UNALLOCATED(6,4), + ID_UNALLOCATED(6,5), + ID_UNALLOCATED(6,6), + ID_UNALLOCATED(6,7), + + /* CRm=7 */ + ID_FILTERED(ID_AA64MMFR0_EL1, id_aa64mmfr0_el1, + ~(ID_AA64MMFR0_EL1_RES0 | + ID_AA64MMFR0_EL1_ASIDBITS)), + ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | + ID_AA64MMFR1_EL1_XNX | + ID_AA64MMFR1_EL1_VH | + ID_AA64MMFR1_EL1_VMIDBits)), + ID_FILTERED(ID_AA64MMFR2_EL1, + id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 | + ID_AA64MMFR2_EL1_EVT | + ID_AA64MMFR2_EL1_FWB | + ID_AA64MMFR2_EL1_IDS | + ID_AA64MMFR2_EL1_NV | + ID_AA64MMFR2_EL1_CCIDX)), + ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_SCTLRX | + ID_AA64MMFR3_EL1_S1PIE | + ID_AA64MMFR3_EL1_S1POE)), + ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac), + ID_UNALLOCATED(7,5), + ID_UNALLOCATED(7,6), + ID_UNALLOCATED(7,7), + + { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, + { SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 }, + { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 }, + { SYS_DESC(SYS_SCTLR2_EL1), access_vm_reg, reset_val, SCTLR2_EL1, 0, + .visibility = sctlr2_visibility }, + + MTE_REG(RGSR_EL1), + MTE_REG(GCR_EL1), + + { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility }, + { SYS_DESC(SYS_TRFCR_EL1), undef_access }, + { SYS_DESC(SYS_SMPRI_EL1), undef_access }, + { SYS_DESC(SYS_SMCR_EL1), undef_access }, + { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, + { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, + { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, + { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0, + .visibility = tcr2_visibility }, + + PTRAUTH_KEY(APIA), + PTRAUTH_KEY(APIB), + PTRAUTH_KEY(APDA), + PTRAUTH_KEY(APDB), + PTRAUTH_KEY(APGA), + + { SYS_DESC(SYS_SPSR_EL1), access_spsr}, + { SYS_DESC(SYS_ELR_EL1), access_elr}, + + { SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, + + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, + { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, + { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, + + { SYS_DESC(SYS_ERRIDR_EL1), access_ras }, + { SYS_DESC(SYS_ERRSELR_EL1), access_ras }, + { SYS_DESC(SYS_ERXFR_EL1), access_ras }, + { SYS_DESC(SYS_ERXCTLR_EL1), access_ras }, + { SYS_DESC(SYS_ERXSTATUS_EL1), access_ras }, + { SYS_DESC(SYS_ERXADDR_EL1), access_ras }, + { SYS_DESC(SYS_ERXPFGF_EL1), access_ras }, + { SYS_DESC(SYS_ERXPFGCTL_EL1), access_ras }, + { SYS_DESC(SYS_ERXPFGCDN_EL1), access_ras }, + { SYS_DESC(SYS_ERXMISC0_EL1), access_ras }, + { SYS_DESC(SYS_ERXMISC1_EL1), access_ras }, + { SYS_DESC(SYS_ERXMISC2_EL1), access_ras }, + { 
SYS_DESC(SYS_ERXMISC3_EL1), access_ras }, + + MTE_REG(TFSR_EL1), + MTE_REG(TFSRE0_EL1), + + { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 }, + { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, + + { SYS_DESC(SYS_PMSCR_EL1), undef_access }, + { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access }, + { SYS_DESC(SYS_PMSICR_EL1), undef_access }, + { SYS_DESC(SYS_PMSIRR_EL1), undef_access }, + { SYS_DESC(SYS_PMSFCR_EL1), undef_access }, + { SYS_DESC(SYS_PMSEVFR_EL1), undef_access }, + { SYS_DESC(SYS_PMSLATFR_EL1), undef_access }, + { SYS_DESC(SYS_PMSIDR_EL1), undef_access }, + { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access }, + { SYS_DESC(SYS_PMBPTR_EL1), undef_access }, + { SYS_DESC(SYS_PMBSR_EL1), undef_access }, + { SYS_DESC(SYS_PMSDSFR_EL1), undef_access }, + /* PMBIDR_EL1 is not trapped */ + + { PMU_SYS_REG(PMINTENSET_EL1), + .access = access_pminten, .reg = PMINTENSET_EL1, + .get_user = get_pmreg, .set_user = set_pmreg }, + { PMU_SYS_REG(PMINTENCLR_EL1), + .access = access_pminten, .reg = PMINTENSET_EL1, + .get_user = get_pmreg, .set_user = set_pmreg }, + { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, + + { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, + { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1, + .visibility = s1pie_visibility }, + { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1, + .visibility = s1pie_visibility }, + { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, + .visibility = s1poe_visibility }, + { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, + + { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, + { SYS_DESC(SYS_LOREA_EL1), trap_loregion }, + { SYS_DESC(SYS_LORN_EL1), trap_loregion }, + { SYS_DESC(SYS_LORC_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAMIDR_EL1), undef_access }, + { SYS_DESC(SYS_LORID_EL1), trap_loregion }, + + { SYS_DESC(SYS_MPAM1_EL1), undef_access }, + { SYS_DESC(SYS_MPAM0_EL1), undef_access }, + { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, + { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, + + { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, + { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, + { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, + { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, + { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, + + { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, + { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, + + { SYS_DESC(SYS_ACCDATA_EL1), undef_access }, + + { SYS_DESC(SYS_SCXTNUM_EL1), undef_access }, + + { 
SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, + + { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, + { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1, + .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 }, + { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, + { SYS_DESC(SYS_SMIDR_EL1), undef_access }, + IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)), + { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, + ID_FILTERED(CTR_EL0, ctr_el0, + CTR_EL0_DIC_MASK | + CTR_EL0_IDC_MASK | + CTR_EL0_DminLine_MASK | + CTR_EL0_L1Ip_MASK | + CTR_EL0_IminLine_MASK), + { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, + { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, + + { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, + .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, + { PMU_SYS_REG(PMCNTENSET_EL0), + .access = access_pmcnten, .reg = PMCNTENSET_EL0, + .get_user = get_pmreg, .set_user = set_pmreg }, + { PMU_SYS_REG(PMCNTENCLR_EL0), + .access = access_pmcnten, .reg = PMCNTENSET_EL0, + .get_user = get_pmreg, .set_user = set_pmreg }, + { PMU_SYS_REG(PMOVSCLR_EL0), + .access = access_pmovs, .reg = PMOVSSET_EL0, + .get_user = get_pmreg, .set_user = set_pmreg }, + /* + * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was + * previously (and pointlessly) advertised in the past... + */ + { PMU_SYS_REG(PMSWINC_EL0), + .get_user = get_raz_reg, .set_user = set_wi_reg, + .access = access_pmswinc, .reset = NULL }, + { PMU_SYS_REG(PMSELR_EL0), + .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, + { PMU_SYS_REG(PMCEID0_EL0), + .access = access_pmceid, .reset = NULL }, + { PMU_SYS_REG(PMCEID1_EL0), + .access = access_pmceid, .reset = NULL }, + { PMU_SYS_REG(PMCCNTR_EL0), + .access = access_pmu_evcntr, .reset = reset_unknown, + .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr, + .set_user = set_pmu_evcntr }, + { PMU_SYS_REG(PMXEVTYPER_EL0), + .access = access_pmu_evtyper, .reset = NULL }, + { PMU_SYS_REG(PMXEVCNTR_EL0), + .access = access_pmu_evcntr, .reset = NULL }, + /* + * PMUSERENR_EL0 resets as unknown in 64bit mode while it resets as zero + * in 32bit mode. Here we choose to reset it as zero for consistency. 
+ */ + { PMU_SYS_REG(PMUSERENR_EL0), .access = access_pmuserenr, + .reset = reset_val, .reg = PMUSERENR_EL0, .val = 0 }, + { PMU_SYS_REG(PMOVSSET_EL0), + .access = access_pmovs, .reg = PMOVSSET_EL0, + .get_user = get_pmreg, .set_user = set_pmreg }, + + { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0, + .visibility = s1poe_visibility }, + { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, + { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, + { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, + + { SYS_DESC(SYS_SCXTNUM_EL0), undef_access }, + + { SYS_DESC(SYS_AMCR_EL0), undef_access }, + { SYS_DESC(SYS_AMCFGR_EL0), undef_access }, + { SYS_DESC(SYS_AMCGCR_EL0), undef_access }, + { SYS_DESC(SYS_AMUSERENR_EL0), undef_access }, + { SYS_DESC(SYS_AMCNTENCLR0_EL0), undef_access }, + { SYS_DESC(SYS_AMCNTENSET0_EL0), undef_access }, + { SYS_DESC(SYS_AMCNTENCLR1_EL0), undef_access }, + { SYS_DESC(SYS_AMCNTENSET1_EL0), undef_access }, + AMU_AMEVCNTR0_EL0(0), + AMU_AMEVCNTR0_EL0(1), + AMU_AMEVCNTR0_EL0(2), + AMU_AMEVCNTR0_EL0(3), + AMU_AMEVCNTR0_EL0(4), + AMU_AMEVCNTR0_EL0(5), + AMU_AMEVCNTR0_EL0(6), + AMU_AMEVCNTR0_EL0(7), + AMU_AMEVCNTR0_EL0(8), + AMU_AMEVCNTR0_EL0(9), + AMU_AMEVCNTR0_EL0(10), + AMU_AMEVCNTR0_EL0(11), + AMU_AMEVCNTR0_EL0(12), + AMU_AMEVCNTR0_EL0(13), + AMU_AMEVCNTR0_EL0(14), + AMU_AMEVCNTR0_EL0(15), + AMU_AMEVTYPER0_EL0(0), + AMU_AMEVTYPER0_EL0(1), + AMU_AMEVTYPER0_EL0(2), + AMU_AMEVTYPER0_EL0(3), + AMU_AMEVTYPER0_EL0(4), + AMU_AMEVTYPER0_EL0(5), + AMU_AMEVTYPER0_EL0(6), + AMU_AMEVTYPER0_EL0(7), + AMU_AMEVTYPER0_EL0(8), + AMU_AMEVTYPER0_EL0(9), + AMU_AMEVTYPER0_EL0(10), + AMU_AMEVTYPER0_EL0(11), + AMU_AMEVTYPER0_EL0(12), + AMU_AMEVTYPER0_EL0(13), + AMU_AMEVTYPER0_EL0(14), + AMU_AMEVTYPER0_EL0(15), + AMU_AMEVCNTR1_EL0(0), + AMU_AMEVCNTR1_EL0(1), + AMU_AMEVCNTR1_EL0(2), + AMU_AMEVCNTR1_EL0(3), + AMU_AMEVCNTR1_EL0(4), + AMU_AMEVCNTR1_EL0(5), + AMU_AMEVCNTR1_EL0(6), + AMU_AMEVCNTR1_EL0(7), + AMU_AMEVCNTR1_EL0(8), + AMU_AMEVCNTR1_EL0(9), + AMU_AMEVCNTR1_EL0(10), + AMU_AMEVCNTR1_EL0(11), + AMU_AMEVCNTR1_EL0(12), + AMU_AMEVCNTR1_EL0(13), + AMU_AMEVCNTR1_EL0(14), + AMU_AMEVCNTR1_EL0(15), + AMU_AMEVTYPER1_EL0(0), + AMU_AMEVTYPER1_EL0(1), + AMU_AMEVTYPER1_EL0(2), + AMU_AMEVTYPER1_EL0(3), + AMU_AMEVTYPER1_EL0(4), + AMU_AMEVTYPER1_EL0(5), + AMU_AMEVTYPER1_EL0(6), + AMU_AMEVTYPER1_EL0(7), + AMU_AMEVTYPER1_EL0(8), + AMU_AMEVTYPER1_EL0(9), + AMU_AMEVTYPER1_EL0(10), + AMU_AMEVTYPER1_EL0(11), + AMU_AMEVTYPER1_EL0(12), + AMU_AMEVTYPER1_EL0(13), + AMU_AMEVTYPER1_EL0(14), + AMU_AMEVTYPER1_EL0(15), + + { SYS_DESC(SYS_CNTPCT_EL0), .access = access_arch_timer, + .get_user = arch_timer_get_user, .set_user = arch_timer_set_user }, + { SYS_DESC(SYS_CNTVCT_EL0), .access = access_arch_timer, + .get_user = arch_timer_get_user, .set_user = arch_timer_set_user }, + { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, + TIMER_REG(CNTP_CTL_EL0, NULL), + TIMER_REG(CNTP_CVAL_EL0, NULL), + + { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, + TIMER_REG(CNTV_CTL_EL0, NULL), + TIMER_REG(CNTV_CVAL_EL0, NULL), + + /* PMEVCNTRn_EL0 */ + PMU_PMEVCNTR_EL0(0), + PMU_PMEVCNTR_EL0(1), + PMU_PMEVCNTR_EL0(2), + PMU_PMEVCNTR_EL0(3), + PMU_PMEVCNTR_EL0(4), + PMU_PMEVCNTR_EL0(5), + PMU_PMEVCNTR_EL0(6), + PMU_PMEVCNTR_EL0(7), + PMU_PMEVCNTR_EL0(8), + PMU_PMEVCNTR_EL0(9), + PMU_PMEVCNTR_EL0(10), + PMU_PMEVCNTR_EL0(11), + PMU_PMEVCNTR_EL0(12), + PMU_PMEVCNTR_EL0(13), + PMU_PMEVCNTR_EL0(14), + 
PMU_PMEVCNTR_EL0(15), + PMU_PMEVCNTR_EL0(16), + PMU_PMEVCNTR_EL0(17), + PMU_PMEVCNTR_EL0(18), + PMU_PMEVCNTR_EL0(19), + PMU_PMEVCNTR_EL0(20), + PMU_PMEVCNTR_EL0(21), + PMU_PMEVCNTR_EL0(22), + PMU_PMEVCNTR_EL0(23), + PMU_PMEVCNTR_EL0(24), + PMU_PMEVCNTR_EL0(25), + PMU_PMEVCNTR_EL0(26), + PMU_PMEVCNTR_EL0(27), + PMU_PMEVCNTR_EL0(28), + PMU_PMEVCNTR_EL0(29), + PMU_PMEVCNTR_EL0(30), + /* PMEVTYPERn_EL0 */ + PMU_PMEVTYPER_EL0(0), + PMU_PMEVTYPER_EL0(1), + PMU_PMEVTYPER_EL0(2), + PMU_PMEVTYPER_EL0(3), + PMU_PMEVTYPER_EL0(4), + PMU_PMEVTYPER_EL0(5), + PMU_PMEVTYPER_EL0(6), + PMU_PMEVTYPER_EL0(7), + PMU_PMEVTYPER_EL0(8), + PMU_PMEVTYPER_EL0(9), + PMU_PMEVTYPER_EL0(10), + PMU_PMEVTYPER_EL0(11), + PMU_PMEVTYPER_EL0(12), + PMU_PMEVTYPER_EL0(13), + PMU_PMEVTYPER_EL0(14), + PMU_PMEVTYPER_EL0(15), + PMU_PMEVTYPER_EL0(16), + PMU_PMEVTYPER_EL0(17), + PMU_PMEVTYPER_EL0(18), + PMU_PMEVTYPER_EL0(19), + PMU_PMEVTYPER_EL0(20), + PMU_PMEVTYPER_EL0(21), + PMU_PMEVTYPER_EL0(22), + PMU_PMEVTYPER_EL0(23), + PMU_PMEVTYPER_EL0(24), + PMU_PMEVTYPER_EL0(25), + PMU_PMEVTYPER_EL0(26), + PMU_PMEVTYPER_EL0(27), + PMU_PMEVTYPER_EL0(28), + PMU_PMEVTYPER_EL0(29), + PMU_PMEVTYPER_EL0(30), + /* + * PMCCFILTR_EL0 resets as unknown in 64bit mode while it resets as zero + * in 32bit mode. Here we choose to reset it as zero for consistency. + */ + { PMU_SYS_REG(PMCCFILTR_EL0), .access = access_pmu_evtyper, + .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 }, + + EL2_REG_VNCR(VPIDR_EL2, reset_unknown, 0), + EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0), + EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), + EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), + EL2_REG_FILTERED(SCTLR2_EL2, access_vm_reg, reset_val, 0, + sctlr2_el2_visibility), + EL2_REG_VNCR(HCR_EL2, reset_hcr, 0), + EL2_REG(MDCR_EL2, access_mdcr, reset_mdcr, 0), + EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), + EL2_REG_VNCR(HSTR_EL2, reset_val, 0), + EL2_REG_VNCR_FILT(HFGRTR_EL2, fgt_visibility), + EL2_REG_VNCR_FILT(HFGWTR_EL2, fgt_visibility), + EL2_REG_VNCR(HFGITR_EL2, reset_val, 0), + EL2_REG_VNCR(HACR_EL2, reset_val, 0), + + EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0, + sve_el2_visibility), + + EL2_REG_VNCR(HCRX_EL2, reset_val, 0), + + EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), + EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), + EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), + EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1, + tcr2_el2_visibility), + EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), + EL2_REG_VNCR(VTCR_EL2, reset_val, 0), + EL2_REG_FILTERED(VNCR_EL2, bad_vncr_trap, reset_val, 0, + vncr_el2_visibility), + + { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, + EL2_REG_VNCR_FILT(HDFGRTR2_EL2, fgt2_visibility), + EL2_REG_VNCR_FILT(HDFGWTR2_EL2, fgt2_visibility), + EL2_REG_VNCR_FILT(HFGRTR2_EL2, fgt2_visibility), + EL2_REG_VNCR_FILT(HFGWTR2_EL2, fgt2_visibility), + EL2_REG_VNCR_FILT(HDFGRTR_EL2, fgt_visibility), + EL2_REG_VNCR_FILT(HDFGWTR_EL2, fgt_visibility), + EL2_REG_VNCR_FILT(HAFGRTR_EL2, fgt_visibility), + EL2_REG_VNCR_FILT(HFGITR2_EL2, fgt2_visibility), + EL2_REG_REDIR(SPSR_EL2, reset_val, 0), + EL2_REG_REDIR(ELR_EL2, reset_val, 0), + { SYS_DESC(SYS_SP_EL1), access_sp_el1}, + + /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ + { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi }, + + { SYS_DESC(SYS_IFSR32_EL2), undef_access, 
reset_unknown, IFSR32_EL2 }, + EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), + EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), + EL2_REG_REDIR(ESR_EL2, reset_val, 0), + EL2_REG_VNCR(VSESR_EL2, reset_unknown, 0), + { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 }, + + EL2_REG_REDIR(FAR_EL2, reset_val, 0), + EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), + + EL2_REG(MAIR_EL2, access_rw, reset_val, 0), + EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0, + s1pie_el2_visibility), + EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0, + s1pie_el2_visibility), + EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0, + s1poe_el2_visibility), + EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_MPAMHCR_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access }, + { SYS_DESC(SYS_MPAM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access }, + + EL2_REG(VBAR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_RVBAR_EL2), undef_access }, + { SYS_DESC(SYS_RMR_EL2), undef_access }, + EL2_REG_VNCR(VDISR_EL2, reset_unknown, 0), + + EL2_REG_VNCR_GICv3(ICH_AP0R0_EL2), + EL2_REG_VNCR_GICv3(ICH_AP0R1_EL2), + EL2_REG_VNCR_GICv3(ICH_AP0R2_EL2), + EL2_REG_VNCR_GICv3(ICH_AP0R3_EL2), + EL2_REG_VNCR_GICv3(ICH_AP1R0_EL2), + EL2_REG_VNCR_GICv3(ICH_AP1R1_EL2), + EL2_REG_VNCR_GICv3(ICH_AP1R2_EL2), + EL2_REG_VNCR_GICv3(ICH_AP1R3_EL2), + + { SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre }, + + EL2_REG_VNCR_GICv3(ICH_HCR_EL2), + { SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr }, + { SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr }, + { SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr }, + { SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr }, + EL2_REG_VNCR_GICv3(ICH_VMCR_EL2), + + EL2_REG_VNCR_GICv3(ICH_LR0_EL2), + EL2_REG_VNCR_GICv3(ICH_LR1_EL2), + EL2_REG_VNCR_GICv3(ICH_LR2_EL2), + EL2_REG_VNCR_GICv3(ICH_LR3_EL2), + EL2_REG_VNCR_GICv3(ICH_LR4_EL2), + EL2_REG_VNCR_GICv3(ICH_LR5_EL2), + EL2_REG_VNCR_GICv3(ICH_LR6_EL2), + EL2_REG_VNCR_GICv3(ICH_LR7_EL2), + EL2_REG_VNCR_GICv3(ICH_LR8_EL2), + EL2_REG_VNCR_GICv3(ICH_LR9_EL2), + EL2_REG_VNCR_GICv3(ICH_LR10_EL2), + EL2_REG_VNCR_GICv3(ICH_LR11_EL2), + EL2_REG_VNCR_GICv3(ICH_LR12_EL2), + EL2_REG_VNCR_GICv3(ICH_LR13_EL2), + EL2_REG_VNCR_GICv3(ICH_LR14_EL2), + EL2_REG_VNCR_GICv3(ICH_LR15_EL2), + + EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), + EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), + + EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), + EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, + TIMER_REG(CNTHP_CTL_EL2, el2_visibility), + TIMER_REG(CNTHP_CVAL_EL2, el2_visibility), + + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer, .visibility = cnthv_visibility }, + TIMER_REG(CNTHV_CTL_EL2, cnthv_visibility), + TIMER_REG(CNTHV_CVAL_EL2, cnthv_visibility), + + { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, + + { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer }, + + { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer }, + + EL2_REG(SP_EL2, NULL, 
reset_unknown, 0), }; -/* Trapped cp15 registers */ -static const struct sys_reg_desc cp15_regs[] = { +static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (__kvm_at_s1e01(vcpu, op, p->regval)) + return false; + + return true; +} + +static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + /* There is no FGT associated with AT S1E2A :-( */ + if (op == OP_AT_S1E2A && + !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + if (__kvm_at_s1e2(vcpu, op, p->regval)) + return false; + + return true; +} + +static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (__kvm_at_s12(vcpu, op, p->regval)) + return false; + + return true; +} + +static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + + if (sys_reg_CRn(instr) == TLBI_CRn_nXS && + !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) + return false; + + if (CRm == TLBI_CRm_nROS && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + return true; +} + +static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + write_lock(&vcpu->kvm->mmu_lock); + /* - * DC{C,I,CI}SW operations: + * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the + * corresponding VMIDs. 
*/ - { Op1( 0), CRn( 7), CRm( 6), Op2( 2), access_dcsw }, - { Op1( 0), CRn( 7), CRm(10), Op2( 2), access_dcsw }, - { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw }, - { Op1( 0), CRn( 9), CRm(12), Op2( 0), pm_fake }, - { Op1( 0), CRn( 9), CRm(12), Op2( 1), pm_fake }, - { Op1( 0), CRn( 9), CRm(12), Op2( 2), pm_fake }, - { Op1( 0), CRn( 9), CRm(12), Op2( 3), pm_fake }, - { Op1( 0), CRn( 9), CRm(12), Op2( 5), pm_fake }, - { Op1( 0), CRn( 9), CRm(12), Op2( 6), pm_fake }, - { Op1( 0), CRn( 9), CRm(12), Op2( 7), pm_fake }, - { Op1( 0), CRn( 9), CRm(13), Op2( 0), pm_fake }, - { Op1( 0), CRn( 9), CRm(13), Op2( 1), pm_fake }, - { Op1( 0), CRn( 9), CRm(13), Op2( 2), pm_fake }, - { Op1( 0), CRn( 9), CRm(14), Op2( 0), pm_fake }, - { Op1( 0), CRn( 9), CRm(14), Op2( 1), pm_fake }, - { Op1( 0), CRn( 9), CRm(14), Op2( 2), pm_fake }, + kvm_nested_s2_unmap(vcpu->kvm, true); + + write_unlock(&vcpu->kvm->mmu_lock); + + return true; +} + +static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + u8 Op2 = sys_reg_Op2(instr); + + if (sys_reg_CRn(instr) == TLBI_CRn_nXS && + !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) + return false; + + if (CRm == TLBI_CRm_IPAIS && (Op2 == 2 || Op2 == 6) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + if (CRm == TLBI_CRm_IPAONS && (Op2 == 0 || Op2 == 4) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + if (CRm == TLBI_CRm_IPAONS && (Op2 == 3 || Op2 == 7) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + return true; +} + +/* Only defined here as this is an internal "abstraction" */ +union tlbi_info { + struct { + u64 start; + u64 size; + } range; + + struct { + u64 addr; + } ipa; + + struct { + u64 addr; + u32 encoding; + } va; }; -/* Target specific emulation tables */ -static struct kvm_sys_reg_target_table *target_tables[KVM_ARM_NUM_TARGETS]; +static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + /* + * The unmap operation is allowed to drop the MMU lock and block, which + * means that @mmu could be used for a different context than the one + * currently being invalidated. + * + * This behavior is still safe, as: + * + * 1) The vCPU(s) that recycled the MMU are responsible for invalidating + * the entire MMU before reusing it, which still honors the intent + * of a TLBI. + * + * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC + * and ERET to the guest), other vCPUs are allowed to use stale + * translations. + * + * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and + * at worst may cause more aborts for shadow stage-2 fills. + * + * Dropping the MMU lock also implies that shadow stage-2 fills could + * happen behind the back of the TLBI. This is still safe, though, as + * the L1 needs to put its stage-2 in a consistent state before doing + * the TLBI. 
+ */ + kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); +} + +static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 limit, vttbr; + + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm)); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .range = { + .start = 0, + .size = limit, + }, + }, + s2_mmu_unmap_range); + + return true; +} + +static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + u64 base, range; + + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + /* + * Because the shadow S2 structure doesn't necessarily reflect that + * of the guest's S2 (different base granule size, for example), we + * decide to ignore TTL and only use the described range. + */ + base = decode_range_tlbi(p->regval, &range, NULL); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .range = { + .start = base, + .size = range, + }, + }, + s2_mmu_unmap_range); + + return true; +} + +static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + unsigned long max_size; + u64 base_addr; + + /* + * We drop a number of things from the supplied value: + * + * - NS bit: we're non-secure only. + * + * - IPA[51:48]: We don't support 52bit IPA just yet... + * + * And of course, adjust the IPA to be on an actual address. + */ + base_addr = (info->ipa.addr & GENMASK_ULL(35, 0)) << 12; + max_size = compute_tlb_inval_range(mmu, info->ipa.addr); + base_addr &= ~(max_size - 1); + + /* + * See comment in s2_mmu_unmap_range() for why this is allowed to + * reschedule. + */ + kvm_stage2_unmap_range(mmu, base_addr, max_size, true); +} + +static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .ipa = { + .addr = p->regval, + }, + }, + s2_mmu_unmap_ipa); + + return true; +} + +static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding)); +} -void kvm_register_target_sys_reg_table(unsigned int target, - struct kvm_sys_reg_target_table *table) +static bool handle_tlbi_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) { - target_tables[target] = table; + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (!kvm_supported_tlbi_s1e2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); + return true; } -/* Get specific register table for this target. 
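To make the address reconstruction in s2_mmu_unmap_ipa() above concrete: the TLBI payload carries the IPA as a page number in its low bits, so keeping bits [35:0] and shifting left by 12 turns it back into a byte address (the NS bit and any IPA[51:48] bits are deliberately dropped, as the comment explains). A stand-alone sketch, with a sample value chosen only for the example:

#include <stdint.h>
#include <stdio.h>

static uint64_t ipa_from_tlbi_payload(uint64_t regval)
{
	/* keep bits [35:0], then convert the page number to bytes */
	return (regval & ((1ULL << 36) - 1)) << 12;
}

int main(void)
{
	/* a payload of 0x12345 names the page starting at IPA 0x12345000 */
	printf("0x%llx\n", (unsigned long long)ipa_from_tlbi_payload(0x12345));
	return 0;
}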
*/ -static const struct sys_reg_desc *get_target_table(unsigned target, - bool mode_is_64, - size_t *num) +static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) { - struct kvm_sys_reg_target_table *table; + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + /* + * If we're here, this is because we've trapped on a EL1 TLBI + * instruction that affects the EL1 translation regime while + * we're running in a context that doesn't allow us to let the + * HW do its thing (aka vEL2): + * + * - HCR_EL2.E2H == 0 : a non-VHE guest + * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode + * + * Another possibility is that we are invalidating the EL2 context + * using EL1 instructions, but that we landed here because we need + * additional invalidation for structures that are not held in the + * CPU TLBs (such as the VNCR pseudo-TLB and its EL2 mapping). In + * that case, we are guaranteed that HCR_EL2.{E2H,TGE} == { 1, 1 } + * as we don't allow an NV-capable L1 in a nVHE configuration. + * + * We don't expect these helpers to ever be called when running + * in a vEL1 context. + */ + + WARN_ON(!vcpu_is_el2(vcpu)); + + if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) { + kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); + return true; + } + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, + get_vmid(__vcpu_sys_reg(vcpu, VTTBR_EL2)), + &(union tlbi_info) { + .va = { + .addr = p->regval, + .encoding = sys_encoding, + }, + }, + s2_mmu_tlbi_s1e1); + + return true; +} + +#define SYS_INSN(insn, access_fn) \ + { \ + SYS_DESC(OP_##insn), \ + .access = (access_fn), \ + } + +static struct sys_reg_desc sys_insn_descs[] = { + { SYS_DESC(SYS_DC_ISW), access_dcsw }, + { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, + { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, + + SYS_INSN(AT_S1E1R, handle_at_s1e01), + SYS_INSN(AT_S1E1W, handle_at_s1e01), + SYS_INSN(AT_S1E0R, handle_at_s1e01), + SYS_INSN(AT_S1E0W, handle_at_s1e01), + SYS_INSN(AT_S1E1RP, handle_at_s1e01), + SYS_INSN(AT_S1E1WP, handle_at_s1e01), + + { SYS_DESC(SYS_DC_CSW), access_dcsw }, + { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CISW), access_dcsw }, + { SYS_DESC(SYS_DC_CIGSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw }, + + SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1, 
handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1), + + SYS_INSN(AT_S1E2R, handle_at_s1e2), + SYS_INSN(AT_S1E2W, handle_at_s1e2), + SYS_INSN(AT_S12E1R, handle_at_s12), + SYS_INSN(AT_S12E1W, handle_at_s12), + SYS_INSN(AT_S12E0R, handle_at_s12), + SYS_INSN(AT_S12E0W, handle_at_s12), + SYS_INSN(AT_S1E2A, handle_at_s1e2), + + SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), + + SYS_INSN(TLBI_ALLE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE1OS, handle_alle1is), + SYS_INSN(TLBI_VALE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), + + SYS_INSN(TLBI_RVAE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2IS, handle_tlbi_el2), + + SYS_INSN(TLBI_ALLE1IS, handle_alle1is), + + SYS_INSN(TLBI_VALE2IS, handle_tlbi_el2), + + SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), + SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), + SYS_INSN(TLBI_RVAE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_RVAE2, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2, handle_tlbi_el2), + + SYS_INSN(TLBI_ALLE1, handle_alle1is), + + SYS_INSN(TLBI_VALE2, handle_tlbi_el2), + + SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), + + SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), + + 
SYS_INSN(TLBI_ALLE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), + SYS_INSN(TLBI_VALE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), + + SYS_INSN(TLBI_RVAE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2ISNXS, handle_tlbi_el2), + + SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), + SYS_INSN(TLBI_VALE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), + SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1NXS, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), + SYS_INSN(TLBI_RVAE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVAE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), + SYS_INSN(TLBI_VALE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), +}; - table = target_tables[target]; - if (mode_is_64) { - *num = table->table64.num; - return table->table64.table; +static bool trap_dbgdidr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + return ignore_write(vcpu, p); } else { - *num = table->table32.num; - return table->table32.table; + u64 dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1); + u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP); + + p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) | + (SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr) << 24) | + (SYS_FIELD_GET(ID_AA64DFR0_EL1, CTX_CMPs, dfr) << 20) | + (SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, dfr) << 16) | + (1 << 15) | (el3 << 14) | (el3 << 12)); + return true; } } -static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num) +/* + * AArch32 debug register mappings + * + * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0] + * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32] + * + * None of the other registers share their location, so treat them as + * if they were 64bit. + */ +#define DBG_BCR_BVR_WCR_WVR(n) \ + /* DBGBVRn */ \ + { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), \ + trap_dbg_wb_reg, NULL, n }, \ + /* DBGBCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWVRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n } + +#define DBGBXVR(n) \ + { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), \ + trap_dbg_wb_reg, NULL, n } + +/* + * Trapped cp14 registers. We generally ignore most of the external + * debug, on the principle that they don't really make sense to a + * guest. Revisit this one day, would this principle change. 
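The 32-bit DBGDIDR value synthesised in trap_dbgdidr() earlier in this hunk packs the (minus-one encoded) watchpoint, breakpoint and context-aware breakpoint counts, the debug version, and the EL3 presence bits into fixed positions. The stand-alone sketch below mirrors that packing with example field values, not taken from any particular CPU:

#include <stdint.h>
#include <stdio.h>

static uint32_t pack_dbgdidr(uint32_t wrps, uint32_t brps, uint32_t ctx_cmps,
			     uint32_t debug_ver, uint32_t el3)
{
	return (wrps << 28) | (brps << 24) | (ctx_cmps << 20) |
	       (debug_ver << 16) | (1u << 15) | (el3 << 14) | (el3 << 12);
}

int main(void)
{
	/* 4 WPs, 6 BPs, 2 context-aware BPs (all minus-one), Armv8 debug, no EL3 */
	printf("0x%08x\n", pack_dbgdidr(3, 5, 1, 6, 0));
	return 0;
}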
+ */ +static const struct sys_reg_desc cp14_regs[] = { + /* DBGDIDR */ + { Op1( 0), CRn( 0), CRm( 0), Op2( 0), trap_dbgdidr }, + /* DBGDTRRXext */ + { Op1( 0), CRn( 0), CRm( 0), Op2( 2), trap_raz_wi }, + + DBG_BCR_BVR_WCR_WVR(0), + /* DBGDSCRint */ + { Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi }, + DBG_BCR_BVR_WCR_WVR(1), + /* DBGDCCINT */ + { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug_regs, NULL, MDCCINT_EL1 }, + /* DBGDSCRext */ + { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug_regs, NULL, MDSCR_EL1 }, + DBG_BCR_BVR_WCR_WVR(2), + /* DBGDTR[RT]Xint */ + { Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi }, + /* DBGDTR[RT]Xext */ + { Op1( 0), CRn( 0), CRm( 3), Op2( 2), trap_raz_wi }, + DBG_BCR_BVR_WCR_WVR(3), + DBG_BCR_BVR_WCR_WVR(4), + DBG_BCR_BVR_WCR_WVR(5), + /* DBGWFAR */ + { Op1( 0), CRn( 0), CRm( 6), Op2( 0), trap_raz_wi }, + /* DBGOSECCR */ + { Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi }, + DBG_BCR_BVR_WCR_WVR(6), + /* DBGVCR */ + { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug_regs, NULL, DBGVCR32_EL2 }, + DBG_BCR_BVR_WCR_WVR(7), + DBG_BCR_BVR_WCR_WVR(8), + DBG_BCR_BVR_WCR_WVR(9), + DBG_BCR_BVR_WCR_WVR(10), + DBG_BCR_BVR_WCR_WVR(11), + DBG_BCR_BVR_WCR_WVR(12), + DBG_BCR_BVR_WCR_WVR(13), + DBG_BCR_BVR_WCR_WVR(14), + DBG_BCR_BVR_WCR_WVR(15), + + /* DBGDRAR (32bit) */ + { Op1( 0), CRn( 1), CRm( 0), Op2( 0), trap_raz_wi }, + + DBGBXVR(0), + /* DBGOSLAR */ + { Op1( 0), CRn( 1), CRm( 0), Op2( 4), trap_oslar_el1 }, + DBGBXVR(1), + /* DBGOSLSR */ + { Op1( 0), CRn( 1), CRm( 1), Op2( 4), trap_oslsr_el1, NULL, OSLSR_EL1 }, + DBGBXVR(2), + DBGBXVR(3), + /* DBGOSDLR */ + { Op1( 0), CRn( 1), CRm( 3), Op2( 4), trap_raz_wi }, + DBGBXVR(4), + /* DBGPRCR */ + { Op1( 0), CRn( 1), CRm( 4), Op2( 4), trap_raz_wi }, + DBGBXVR(5), + DBGBXVR(6), + DBGBXVR(7), + DBGBXVR(8), + DBGBXVR(9), + DBGBXVR(10), + DBGBXVR(11), + DBGBXVR(12), + DBGBXVR(13), + DBGBXVR(14), + DBGBXVR(15), + + /* DBGDSAR (32bit) */ + { Op1( 0), CRn( 2), CRm( 0), Op2( 0), trap_raz_wi }, + + /* DBGDEVID2 */ + { Op1( 0), CRn( 7), CRm( 0), Op2( 7), trap_raz_wi }, + /* DBGDEVID1 */ + { Op1( 0), CRn( 7), CRm( 1), Op2( 7), trap_raz_wi }, + /* DBGDEVID */ + { Op1( 0), CRn( 7), CRm( 2), Op2( 7), trap_raz_wi }, + /* DBGCLAIMSET */ + { Op1( 0), CRn( 7), CRm( 8), Op2( 6), trap_raz_wi }, + /* DBGCLAIMCLR */ + { Op1( 0), CRn( 7), CRm( 9), Op2( 6), trap_raz_wi }, + /* DBGAUTHSTATUS */ + { Op1( 0), CRn( 7), CRm(14), Op2( 6), trap_dbgauthstatus_el1 }, +}; + +/* Trapped cp14 64bit registers */ +static const struct sys_reg_desc cp14_64_regs[] = { + /* DBGDRAR (64bit) */ + { Op1( 0), CRm( 1), .access = trap_raz_wi }, + + /* DBGDSAR (64bit) */ + { Op1( 0), CRm( 2), .access = trap_raz_wi }, +}; + +#define CP15_PMU_SYS_REG(_map, _Op1, _CRn, _CRm, _Op2) \ + AA32(_map), \ + Op1(_Op1), CRn(_CRn), CRm(_CRm), Op2(_Op2), \ + .visibility = pmu_visibility + +/* Macro to expand the PMEVCNTRn register */ +#define PMU_PMEVCNTR(n) \ + { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ + (0b1000 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ + .access = access_pmu_evcntr } + +/* Macro to expand the PMEVTYPERn register */ +#define PMU_PMEVTYPER(n) \ + { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ + (0b1100 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ + .access = access_pmu_evtyper } +/* + * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding, + * depending on the way they are accessed (as a 32bit or a 64bit + * register). 
+ */ +static const struct sys_reg_desc cp15_regs[] = { + { Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr }, + { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, SCTLR_EL1 }, + /* ACTLR */ + { AA32(LO), Op1( 0), CRn( 1), CRm( 0), Op2( 1), access_actlr, NULL, ACTLR_EL1 }, + /* ACTLR2 */ + { AA32(HI), Op1( 0), CRn( 1), CRm( 0), Op2( 3), access_actlr, NULL, ACTLR_EL1 }, + { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, + { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, TTBR1_EL1 }, + /* TTBCR */ + { AA32(LO), Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, TCR_EL1 }, + /* TTBCR2 */ + { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, + { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, + { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, + /* DFSR */ + { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, + { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, + /* ADFSR */ + { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, AFSR0_EL1 }, + /* AIFSR */ + { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, AFSR1_EL1 }, + /* DFAR */ + { AA32(LO), Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, FAR_EL1 }, + /* IFAR */ + { AA32(HI), Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, FAR_EL1 }, + + /* + * DC{C,I,CI}SW operations: + */ + { Op1( 0), CRn( 7), CRm( 6), Op2( 2), access_dcsw }, + { Op1( 0), CRn( 7), CRm(10), Op2( 2), access_dcsw }, + { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw }, + + /* PMU */ + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 0), .access = access_pmcr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 1), .access = access_pmcnten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 2), .access = access_pmcnten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 3), .access = access_pmovs }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 4), .access = access_pmswinc }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 5), .access = access_pmselr }, + { CP15_PMU_SYS_REG(LO, 0, 9, 12, 6), .access = access_pmceid }, + { CP15_PMU_SYS_REG(LO, 0, 9, 12, 7), .access = access_pmceid }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 0), .access = access_pmu_evcntr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 1), .access = access_pmu_evtyper }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 2), .access = access_pmu_evcntr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 0), .access = access_pmuserenr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 1), .access = access_pminten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 2), .access = access_pminten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 3), .access = access_pmovs }, + { CP15_PMU_SYS_REG(HI, 0, 9, 14, 4), .access = access_pmceid }, + { CP15_PMU_SYS_REG(HI, 0, 9, 14, 5), .access = access_pmceid }, + /* PMMIR */ + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 6), .access = trap_raz_wi }, + + /* PRRR/MAIR0 */ + { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 }, + /* NMRR/MAIR1 */ + { AA32(HI), Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, MAIR_EL1 }, + /* AMAIR0 */ + { AA32(LO), Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, AMAIR_EL1 }, + /* AMAIR1 */ + { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, + + { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { 
CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, + { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, + + { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, + + /* Arch Tmers */ + { SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTP_CTL), access_arch_timer }, + + /* PMEVCNTRn */ + PMU_PMEVCNTR(0), + PMU_PMEVCNTR(1), + PMU_PMEVCNTR(2), + PMU_PMEVCNTR(3), + PMU_PMEVCNTR(4), + PMU_PMEVCNTR(5), + PMU_PMEVCNTR(6), + PMU_PMEVCNTR(7), + PMU_PMEVCNTR(8), + PMU_PMEVCNTR(9), + PMU_PMEVCNTR(10), + PMU_PMEVCNTR(11), + PMU_PMEVCNTR(12), + PMU_PMEVCNTR(13), + PMU_PMEVCNTR(14), + PMU_PMEVCNTR(15), + PMU_PMEVCNTR(16), + PMU_PMEVCNTR(17), + PMU_PMEVCNTR(18), + PMU_PMEVCNTR(19), + PMU_PMEVCNTR(20), + PMU_PMEVCNTR(21), + PMU_PMEVCNTR(22), + PMU_PMEVCNTR(23), + PMU_PMEVCNTR(24), + PMU_PMEVCNTR(25), + PMU_PMEVCNTR(26), + PMU_PMEVCNTR(27), + PMU_PMEVCNTR(28), + PMU_PMEVCNTR(29), + PMU_PMEVCNTR(30), + /* PMEVTYPERn */ + PMU_PMEVTYPER(0), + PMU_PMEVTYPER(1), + PMU_PMEVTYPER(2), + PMU_PMEVTYPER(3), + PMU_PMEVTYPER(4), + PMU_PMEVTYPER(5), + PMU_PMEVTYPER(6), + PMU_PMEVTYPER(7), + PMU_PMEVTYPER(8), + PMU_PMEVTYPER(9), + PMU_PMEVTYPER(10), + PMU_PMEVTYPER(11), + PMU_PMEVTYPER(12), + PMU_PMEVTYPER(13), + PMU_PMEVTYPER(14), + PMU_PMEVTYPER(15), + PMU_PMEVTYPER(16), + PMU_PMEVTYPER(17), + PMU_PMEVTYPER(18), + PMU_PMEVTYPER(19), + PMU_PMEVTYPER(20), + PMU_PMEVTYPER(21), + PMU_PMEVTYPER(22), + PMU_PMEVTYPER(23), + PMU_PMEVTYPER(24), + PMU_PMEVTYPER(25), + PMU_PMEVTYPER(26), + PMU_PMEVTYPER(27), + PMU_PMEVTYPER(28), + PMU_PMEVTYPER(29), + PMU_PMEVTYPER(30), + /* PMCCFILTR */ + { CP15_PMU_SYS_REG(DIRECT, 0, 14, 15, 7), .access = access_pmu_evtyper }, + + { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, + { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, + + /* CCSIDR2 */ + { Op1(1), CRn( 0), CRm( 0), Op2(2), undef_access }, + + { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 }, +}; + +static const struct sys_reg_desc cp15_64_regs[] = { + { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, + { CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr }, + { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */ + { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer }, + { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, + { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ + { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer }, + { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ + { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer 
}, +}; + +static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, + bool reset_check) { unsigned int i; - for (i = 0; i < num; i++) { - const struct sys_reg_desc *r = &table[i]; - - if (params->Op0 != r->Op0) - continue; - if (params->Op1 != r->Op1) - continue; - if (params->CRn != r->CRn) - continue; - if (params->CRm != r->CRm) - continue; - if (params->Op2 != r->Op2) - continue; + for (i = 0; i < n; i++) { + if (reset_check && table[i].reg && !table[i].reset) { + kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n", + &table[i], i, table[i].name); + return false; + } - return r; + if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) { + kvm_err("sys_reg table %pS entry %d (%s -> %s) out of order\n", + &table[i], i, table[i - 1].name, table[i].name); + return false; + } } - return NULL; + + return true; } -int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); return 1; } -int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run) +static void perform_access(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) { - kvm_inject_undefined(vcpu); - return 1; + trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); + + /* Check for regs disabled by runtime config */ + if (sysreg_hidden(vcpu, r)) { + kvm_inject_undefined(vcpu); + return; + } + + /* + * Not having an accessor means that we have configured a trap + * that we don't know how to handle. This certainly qualifies + * as a gross bug that should be fixed right away. + */ + BUG_ON(!r->access); + + /* Skip instruction if instructed so */ + if (likely(r->access(vcpu, params, r))) + kvm_incr_pc(vcpu); } -static void emulate_cp15(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) +/* + * emulate_cp -- tries to match a sys_reg access in a handling table, and + * call the corresponding trap handler. + * + * @params: pointer to the descriptor of the access + * @table: array of trap descriptors + * @num: size of the trap descriptor array + * + * Return true if the access has been handled, false if not. + */ +static bool emulate_cp(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *table, + size_t num) { - size_t num; - const struct sys_reg_desc *table, *r; + const struct sys_reg_desc *r; - table = get_target_table(vcpu->arch.target, false, &num); + if (!table) + return false; /* Not handled */ - /* Search target-specific then generic table. */ r = find_reg(params, table, num); - if (!r) - r = find_reg(params, cp15_regs, ARRAY_SIZE(cp15_regs)); - if (likely(r)) { - /* - * Not having an accessor means that we have - * configured a trap that we don't know how to - * handle. This certainly qualifies as a gross bug - * that should be fixed right away. - */ - BUG_ON(!r->access); + if (r) { + perform_access(vcpu, params, r); + return true; + } - if (likely(r->access(vcpu, params, r))) { - /* Skip instruction, since it was emulated */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return; - } - /* If access function fails, it should complain. 
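perform_access() and emulate_cp() above reduce to: look the access up in a table, run the descriptor's handler, and advance the PC past the trapped instruction when the handler reports success (a false return means the caller injects UNDEF). A simplified, self-contained sketch of that dispatch, with stand-in types that are not the kernel's:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct params { uint8_t Op1, CRn, CRm, Op2; bool is_write; uint64_t regval; };

struct desc {
	uint8_t Op1, CRn, CRm, Op2;
	bool (*access)(struct params *p);	/* true once the access is emulated */
};

static bool read_zero(struct params *p) { p->regval = 0; return true; }

static const struct desc table[] = {
	{ .Op1 = 0, .CRn = 7, .CRm = 14, .Op2 = 2, .access = read_zero },
};

/* Mirrors emulate_cp(): returns true if some descriptor handled the access. */
static bool emulate(struct params *p, uint64_t *pc)
{
	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		const struct desc *d = &table[i];

		if (d->Op1 != p->Op1 || d->CRn != p->CRn ||
		    d->CRm != p->CRm || d->Op2 != p->Op2)
			continue;
		if (d->access(p))
			*pc += 4;	/* skip the emulated instruction */
		return true;
	}
	return false;	/* not handled: caller injects UNDEF */
}

int main(void)
{
	struct params p = { .Op1 = 0, .CRn = 7, .CRm = 14, .Op2 = 2 };
	uint64_t pc = 0x80001000;

	printf("handled=%d regval=%llu pc=%#llx\n", emulate(&p, &pc),
	       (unsigned long long)p.regval, (unsigned long long)pc);
	return 0;
}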
*/ + /* Not handled */ + return false; +} + +static void unhandled_cp_access(struct kvm_vcpu *vcpu, + struct sys_reg_params *params) +{ + u8 esr_ec = kvm_vcpu_trap_get_class(vcpu); + int cp = -1; + + switch (esr_ec) { + case ESR_ELx_EC_CP15_32: + case ESR_ELx_EC_CP15_64: + cp = 15; + break; + case ESR_ELx_EC_CP14_MR: + case ESR_ELx_EC_CP14_64: + cp = 14; + break; + default: + WARN_ON(1); } - kvm_err("Unsupported guest CP15 access at: %08lx\n", *vcpu_pc(vcpu)); - print_sys_reg_instr(params); + print_sys_reg_msg(params, + "Unsupported guest CP%d access at: %08lx [%08lx]\n", + cp, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); kvm_inject_undefined(vcpu); } /** - * kvm_handle_cp15_64 -- handles a mrrc/mcrr trap on a guest CP15 access + * kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer - * @run: The kvm_run struct + * @global: &struct sys_reg_desc + * @nr_global: size of the @global array */ -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *global, + size_t nr_global) { struct sys_reg_params params; - u32 hsr = kvm_vcpu_get_hsr(vcpu); - int Rt2 = (hsr >> 10) & 0xf; + u64 esr = kvm_vcpu_get_esr(vcpu); + int Rt = kvm_vcpu_sys_get_rt(vcpu); + int Rt2 = (esr >> 10) & 0x1f; - params.CRm = (hsr >> 1) & 0xf; - params.Rt = (hsr >> 5) & 0xf; - params.is_write = ((hsr & 1) == 0); + params.CRm = (esr >> 1) & 0xf; + params.is_write = ((esr & 1) == 0); params.Op0 = 0; - params.Op1 = (hsr >> 16) & 0xf; + params.Op1 = (esr >> 16) & 0xf; params.Op2 = 0; params.CRn = 0; /* - * Massive hack here. Store Rt2 in the top 32bits so we only - * have one register to deal with. As we use the same trap + * Make a 64-bit value out of Rt and Rt2. As we use the same trap * backends between AArch32 and AArch64, we get away with it. */ if (params.is_write) { - u64 val = *vcpu_reg(vcpu, params.Rt); - val &= 0xffffffff; - val |= *vcpu_reg(vcpu, Rt2) << 32; - *vcpu_reg(vcpu, params.Rt) = val; + params.regval = vcpu_get_reg(vcpu, Rt) & 0xffffffff; + params.regval |= vcpu_get_reg(vcpu, Rt2) << 32; + } + + /* + * If the table contains a handler, handle the + * potential register operation in the case of a read and return + * with success. + */ + if (emulate_cp(vcpu, ¶ms, global, nr_global)) { + /* Split up the value between registers for the read side */ + if (!params.is_write) { + vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval)); + vcpu_set_reg(vcpu, Rt2, upper_32_bits(params.regval)); + } + + return 1; + } + + unhandled_cp_access(vcpu, ¶ms); + return 1; +} + +static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params); + +/* + * The CP10 ID registers are architecturally mapped to AArch64 feature + * registers. Abuse that fact so we can rely on the AArch64 handler for accesses + * from AArch32. 
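The MCRR/MRRC path above builds one 64-bit value out of the Rt/Rt2 pair before calling the shared handler, then splits it back for reads. A standalone sketch of just that combine/split step, using only the shifts visible in the code:

#include <stdint.h>
#include <stdio.h>

/* MCRR: Rt carries the low half, Rt2 the high half. */
static uint64_t combine_rt_rt2(uint64_t rt, uint64_t rt2)
{
	return (rt & 0xffffffffULL) | (rt2 << 32);
}

/* MRRC: hand the emulated 64-bit result back as two 32-bit GPRs. */
static void split_regval(uint64_t regval, uint64_t *rt, uint64_t *rt2)
{
	*rt  = regval & 0xffffffffULL;	/* lower_32_bits() */
	*rt2 = regval >> 32;		/* upper_32_bits() */
}

int main(void)
{
	uint64_t rt, rt2;

	split_regval(combine_rt_rt2(0x11223344, 0x55667788), &rt, &rt2);
	printf("Rt=%#llx Rt2=%#llx\n",
	       (unsigned long long)rt, (unsigned long long)rt2);
	return 0;
}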
+ */ +static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params) +{ + u8 reg_id = (esr >> 10) & 0xf; + bool valid; + + params->is_write = ((esr & 1) == 0); + params->Op0 = 3; + params->Op1 = 0; + params->CRn = 0; + params->CRm = 3; + + /* CP10 ID registers are read-only */ + valid = !params->is_write; + + switch (reg_id) { + /* MVFR0 */ + case 0b0111: + params->Op2 = 0; + break; + /* MVFR1 */ + case 0b0110: + params->Op2 = 1; + break; + /* MVFR2 */ + case 0b0101: + params->Op2 = 2; + break; + default: + valid = false; + } + + if (valid) + return true; + + kvm_pr_unimpl("Unhandled cp10 register %s: %u\n", + str_write_read(params->is_write), reg_id); + return false; +} + +/** + * kvm_handle_cp10_id() - Handles a VMRS trap on guest access to a 'Media and + * VFP Register' from AArch32. + * @vcpu: The vCPU pointer + * + * MVFR{0-2} are architecturally mapped to the AArch64 MVFR{0-2}_EL1 registers. + * Work out the correct AArch64 system register encoding and reroute to the + * AArch64 system register emulation. + */ +int kvm_handle_cp10_id(struct kvm_vcpu *vcpu) +{ + int Rt = kvm_vcpu_sys_get_rt(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); + struct sys_reg_params params; + + /* UNDEF on any unhandled register access */ + if (!kvm_esr_cp10_id_to_sys64(esr, ¶ms)) { + kvm_inject_undefined(vcpu); + return 1; } - emulate_cp15(vcpu, ¶ms); + if (emulate_sys_reg(vcpu, ¶ms)) + vcpu_set_reg(vcpu, Rt, params.regval); + + return 1; +} + +/** + * kvm_emulate_cp15_id_reg() - Handles an MRC trap on a guest CP15 access where + * CRn=0, which corresponds to the AArch32 feature + * registers. + * @vcpu: the vCPU pointer + * @params: the system register access parameters. + * + * Our cp15 system register tables do not enumerate the AArch32 feature + * registers. Conveniently, our AArch64 table does, and the AArch32 system + * register encoding can be trivially remapped into the AArch64 for the feature + * registers: Append op0=3, leaving op1, CRn, CRm, and op2 the same. + * + * According to DDI0487G.b G7.3.1, paragraph "Behavior of VMSAv8-32 32-bit + * System registers with (coproc=0b1111, CRn==c0)", read accesses from this + * range are either UNKNOWN or RES0. Rerouting remains architectural as we + * treat undefined registers in this range as RAZ. + */ +static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *params) +{ + int Rt = kvm_vcpu_sys_get_rt(vcpu); - /* Do the opposite hack for the read side */ - if (!params.is_write) { - u64 val = *vcpu_reg(vcpu, params.Rt); - val >>= 32; - *vcpu_reg(vcpu, Rt2) = val; + /* Treat impossible writes to RO registers as UNDEFINED */ + if (params->is_write) { + unhandled_cp_access(vcpu, params); + return 1; } + params->Op0 = 3; + + /* + * All registers where CRm > 3 are known to be UNKNOWN/RAZ from AArch32. + * Avoid conflicting with future expansion of AArch64 feature registers + * and simply treat them as RAZ here. 
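Both reroutes rely on the AArch32 feature registers lining up with AArch64 encodings once Op0 is forced to 3; for the CP10 case the VMRS register id additionally has to be translated into the Op2 of the matching MVFRn_EL1. A small standalone version of that translation, taken straight from the switch above:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Map a VMRS register id onto MVFR{0,1,2}_EL1, i.e. Op0=3, Op1=0, CRn=0, CRm=3, Op2 below. */
static bool cp10_id_to_op2(uint8_t reg_id, uint8_t *op2)
{
	switch (reg_id) {
	case 0b0111: *op2 = 0; return true;	/* MVFR0 */
	case 0b0110: *op2 = 1; return true;	/* MVFR1 */
	case 0b0101: *op2 = 2; return true;	/* MVFR2 */
	default:     return false;		/* anything else UNDEFs */
	}
}

int main(void)
{
	for (uint8_t id = 0; id < 16; id++) {
		uint8_t op2;

		if (cp10_id_to_op2(id, &op2))
			printf("reg_id %#x -> MVFR%u (Op2=%u)\n", id, op2, op2);
	}
	return 0;
}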
+ */ + if (params->CRm > 3) + params->regval = 0; + else if (!emulate_sys_reg(vcpu, params)) + return 1; + + vcpu_set_reg(vcpu, Rt, params->regval); return 1; } /** - * kvm_handle_cp15_32 -- handles a mrc/mcr trap on a guest CP15 access + * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer - * @run: The kvm_run struct + * @params: &struct sys_reg_params + * @global: &struct sys_reg_desc + * @nr_global: size of the @global array */ -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *global, + size_t nr_global) { - struct sys_reg_params params; - u32 hsr = kvm_vcpu_get_hsr(vcpu); + int Rt = kvm_vcpu_sys_get_rt(vcpu); - params.CRm = (hsr >> 1) & 0xf; - params.Rt = (hsr >> 5) & 0xf; - params.is_write = ((hsr & 1) == 0); - params.CRn = (hsr >> 10) & 0xf; - params.Op0 = 0; - params.Op1 = (hsr >> 14) & 0x7; - params.Op2 = (hsr >> 17) & 0x7; + params->regval = vcpu_get_reg(vcpu, Rt); + + if (emulate_cp(vcpu, params, global, nr_global)) { + if (!params->is_write) + vcpu_set_reg(vcpu, Rt, params->regval); + return 1; + } - emulate_cp15(vcpu, ¶ms); + unhandled_cp_access(vcpu, params); return 1; } -static int emulate_sys_reg(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu) { - size_t num; - const struct sys_reg_desc *table, *r; + return kvm_handle_cp_64(vcpu, cp15_64_regs, ARRAY_SIZE(cp15_64_regs)); +} - table = get_target_table(vcpu->arch.target, true, &num); +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) +{ + struct sys_reg_params params; - /* Search target-specific then generic table. */ - r = find_reg(params, table, num); - if (!r) - r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); + + /* + * Certain AArch32 ID registers are handled by rerouting to the AArch64 + * system register table. Registers in the ID range where CRm=0 are + * excluded from this scheme as they do not trivially map into AArch64 + * system register encodings, except for AIDR/REVIDR. + */ + if (params.Op1 == 0 && params.CRn == 0 && + (params.CRm || params.Op2 == 6 /* REVIDR */)) + return kvm_emulate_cp15_id_reg(vcpu, ¶ms); + if (params.Op1 == 1 && params.CRn == 0 && + params.CRm == 0 && params.Op2 == 7 /* AIDR */) + return kvm_emulate_cp15_id_reg(vcpu, ¶ms); + + return kvm_handle_cp_32(vcpu, ¶ms, cp15_regs, ARRAY_SIZE(cp15_regs)); +} + +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu) +{ + return kvm_handle_cp_64(vcpu, cp14_64_regs, ARRAY_SIZE(cp14_64_regs)); +} + +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu) +{ + struct sys_reg_params params; + params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); + + return kvm_handle_cp_32(vcpu, ¶ms, cp14_regs, ARRAY_SIZE(cp14_regs)); +} + +/** + * emulate_sys_reg - Emulate a guest access to an AArch64 system register + * @vcpu: The VCPU pointer + * @params: Decoded system register parameters + * + * Return: true if the system register access was successful, false otherwise. + */ +static bool emulate_sys_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *params) +{ + const struct sys_reg_desc *r; + + r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); if (likely(r)) { - /* - * Not having an accessor means that we have - * configured a trap that we don't know how to - * handle. This certainly qualifies as a gross bug - * that should be fixed right away. 
- */ - BUG_ON(!r->access); + perform_access(vcpu, params, r); + return true; + } - if (likely(r->access(vcpu, params, r))) { - /* Skip instruction, since it was emulated */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; - } - /* If access function fails, it should complain. */ + print_sys_reg_msg(params, + "Unsupported guest sys_reg access at: %lx [%08lx]\n", + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); + kvm_inject_undefined(vcpu); + + return false; +} + +static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos) +{ + unsigned long i, idreg_idx = 0; + + for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *r = &sys_reg_descs[i]; + + if (!is_vm_ftr_id_reg(reg_to_encoding(r))) + continue; + + if (idreg_idx == pos) + return r; + + idreg_idx++; + } + + return NULL; +} + +static void *idregs_debug_start(struct seq_file *s, loff_t *pos) +{ + struct kvm *kvm = s->private; + u8 *iter; + + mutex_lock(&kvm->arch.config_lock); + + iter = &kvm->arch.idreg_debugfs_iter; + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) && + *iter == (u8)~0) { + *iter = *pos; + if (!idregs_debug_find(kvm, *iter)) + iter = NULL; } else { - kvm_err("Unsupported guest sys_reg access at: %lx\n", - *vcpu_pc(vcpu)); - print_sys_reg_instr(params); + iter = ERR_PTR(-EBUSY); } - kvm_inject_undefined(vcpu); - return 1; + + mutex_unlock(&kvm->arch.config_lock); + + return iter; +} + +static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kvm *kvm = s->private; + + (*pos)++; + + if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) { + kvm->arch.idreg_debugfs_iter++; + + return &kvm->arch.idreg_debugfs_iter; + } + + return NULL; } -static void reset_sys_reg_descs(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *table, size_t num) +static void idregs_debug_stop(struct seq_file *s, void *v) { + struct kvm *kvm = s->private; + + if (IS_ERR(v)) + return; + + mutex_lock(&kvm->arch.config_lock); + + kvm->arch.idreg_debugfs_iter = ~0; + + mutex_unlock(&kvm->arch.config_lock); +} + +static int idregs_debug_show(struct seq_file *s, void *v) +{ + const struct sys_reg_desc *desc; + struct kvm *kvm = s->private; + + desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter); + + if (!desc->name) + return 0; + + seq_printf(s, "%20s:\t%016llx\n", + desc->name, kvm_read_vm_id_reg(kvm, reg_to_encoding(desc))); + + return 0; +} + +static const struct seq_operations idregs_debug_sops = { + .start = idregs_debug_start, + .next = idregs_debug_next, + .stop = idregs_debug_stop, + .show = idregs_debug_show, +}; + +DEFINE_SEQ_ATTRIBUTE(idregs_debug); + +void kvm_sys_regs_create_debugfs(struct kvm *kvm) +{ + kvm->arch.idreg_debugfs_iter = ~0; + + debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm, + &idregs_debug_fops); +} + +static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg) +{ + u32 id = reg_to_encoding(reg); + struct kvm *kvm = vcpu->kvm; + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + kvm_set_vm_id_reg(kvm, id, reg->reset(vcpu, reg)); +} + +static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *reg) +{ + if (kvm_vcpu_initialized(vcpu)) + return; + + reg->reset(vcpu, reg); +} + +/** + * kvm_reset_sys_regs - sets system registers to reset value + * @vcpu: The VCPU pointer + * + * This function finds the right table above and sets the registers on the + * virtual CPU struct to their architecturally defined reset values. 
+ */ +void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; unsigned long i; - for (i = 0; i < num; i++) - if (table[i].reset) - table[i].reset(vcpu, &table[i]); + for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *r = &sys_reg_descs[i]; + + if (!r->reset) + continue; + + if (is_vm_ftr_id_reg(reg_to_encoding(r))) + reset_vm_ftr_id_reg(vcpu, r); + else if (is_vcpu_ftr_id_reg(reg_to_encoding(r))) + reset_vcpu_ftr_id_reg(vcpu, r); + else + r->reset(vcpu, r); + + if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) + __vcpu_rmw_sys_reg(vcpu, r->reg, |=, 0); + } + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); + + if (kvm_vcpu_has_pmu(vcpu)) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); } /** - * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access + * kvm_handle_sys_reg -- handles a system instruction or mrs/msr instruction + * trap on a guest execution * @vcpu: The VCPU pointer - * @run: The kvm_run struct */ -int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) { + const struct sys_reg_desc *desc = NULL; struct sys_reg_params params; - unsigned long esr = kvm_vcpu_get_hsr(vcpu); + unsigned long esr = kvm_vcpu_get_esr(vcpu); + int Rt = kvm_vcpu_sys_get_rt(vcpu); + int sr_idx; - params.Op0 = (esr >> 20) & 3; - params.Op1 = (esr >> 14) & 0x7; - params.CRn = (esr >> 10) & 0xf; - params.CRm = (esr >> 1) & 0xf; - params.Op2 = (esr >> 17) & 0x7; - params.Rt = (esr >> 5) & 0x1f; - params.is_write = !(esr & 1); + trace_kvm_handle_sys_reg(esr); + + if (triage_sysreg_trap(vcpu, &sr_idx)) + return 1; + + params = esr_sys64_to_params(esr); + params.regval = vcpu_get_reg(vcpu, Rt); + + /* System registers have Op0=={2,3}, as per DDI487 J.a C5.1.2 */ + if (params.Op0 == 2 || params.Op0 == 3) + desc = &sys_reg_descs[sr_idx]; + else + desc = &sys_insn_descs[sr_idx]; + + perform_access(vcpu, ¶ms, desc); - return emulate_sys_reg(vcpu, ¶ms); + /* Read from system register? */ + if (!params.is_write && + (params.Op0 == 2 || params.Op0 == 3)) + vcpu_set_reg(vcpu, Rt, params.regval); + + return 1; } /****************************************************************************** @@ -590,196 +5220,40 @@ static bool index_to_params(u64 id, struct sys_reg_params *params) } } -/* Decode an index value, and find the sys_reg_desc entry. */ -static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, - u64 id) +const struct sys_reg_desc *get_reg_by_id(u64 id, + const struct sys_reg_desc table[], + unsigned int num) { - size_t num; - const struct sys_reg_desc *table, *r; struct sys_reg_params params; - /* We only do sys_reg for now. */ - if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) - return NULL; - if (!index_to_params(id, ¶ms)) return NULL; - table = get_target_table(vcpu->arch.target, true, &num); - r = find_reg(¶ms, table, num); - if (!r) - r = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); - - /* Not saved in the sys_reg array? */ - if (r && !r->reg) - r = NULL; - - return r; + return find_reg(¶ms, table, num); } -/* - * These are the invariant sys_reg registers: we let the guest see the - * host versions of these, so they're part of the guest state. - * - * A future CPU may provide a mechanism to present different values to - * the guest, or a future kvm may trap them. 
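kvm_handle_sys_reg() above routes on Op0 (2 or 3 means a system register, anything else a trapped system instruction) and only writes the result back to Rt for reads of system registers. A self-contained sketch of that routing and write-back; the handler bodies are illustrative stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct params { uint8_t Op0; bool is_write; uint64_t regval; };

/* Stand-ins for the two descriptor tables' handlers. */
static void access_sys_reg(struct params *p)  { if (!p->is_write) p->regval = 0xabcd; }
static void access_sys_insn(struct params *p) { (void)p; /* e.g. a trapped TLBI */ }

static void handle_sys_trap(struct params *p, uint64_t *gpr_rt)
{
	p->regval = *gpr_rt;			/* Rt value, consumed by writes */

	/* System registers have Op0 == 2 or 3; everything else is an instruction. */
	if (p->Op0 == 2 || p->Op0 == 3) {
		access_sys_reg(p);
		if (!p->is_write)
			*gpr_rt = p->regval;	/* reads update Rt */
	} else {
		access_sys_insn(p);
	}
}

int main(void)
{
	struct params p = { .Op0 = 3, .is_write = false };
	uint64_t rt = 0;

	handle_sys_trap(&p, &rt);
	printf("Rt after MRS emulation = %#llx\n", (unsigned long long)rt);
	return 0;
}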
- */ - -#define FUNCTION_INVARIANT(reg) \ - static void get_##reg(struct kvm_vcpu *v, \ - const struct sys_reg_desc *r) \ - { \ - u64 val; \ - \ - asm volatile("mrs %0, " __stringify(reg) "\n" \ - : "=r" (val)); \ - ((struct sys_reg_desc *)r)->val = val; \ - } - -FUNCTION_INVARIANT(midr_el1) -FUNCTION_INVARIANT(ctr_el0) -FUNCTION_INVARIANT(revidr_el1) -FUNCTION_INVARIANT(id_pfr0_el1) -FUNCTION_INVARIANT(id_pfr1_el1) -FUNCTION_INVARIANT(id_dfr0_el1) -FUNCTION_INVARIANT(id_afr0_el1) -FUNCTION_INVARIANT(id_mmfr0_el1) -FUNCTION_INVARIANT(id_mmfr1_el1) -FUNCTION_INVARIANT(id_mmfr2_el1) -FUNCTION_INVARIANT(id_mmfr3_el1) -FUNCTION_INVARIANT(id_isar0_el1) -FUNCTION_INVARIANT(id_isar1_el1) -FUNCTION_INVARIANT(id_isar2_el1) -FUNCTION_INVARIANT(id_isar3_el1) -FUNCTION_INVARIANT(id_isar4_el1) -FUNCTION_INVARIANT(id_isar5_el1) -FUNCTION_INVARIANT(clidr_el1) -FUNCTION_INVARIANT(aidr_el1) - -/* ->val is filled in by kvm_sys_reg_table_init() */ -static struct sys_reg_desc invariant_sys_regs[] = { - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b000), - NULL, get_midr_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b110), - NULL, get_revidr_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b000), - NULL, get_id_pfr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b001), - NULL, get_id_pfr1_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b010), - NULL, get_id_dfr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b011), - NULL, get_id_afr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b100), - NULL, get_id_mmfr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b101), - NULL, get_id_mmfr1_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b110), - NULL, get_id_mmfr2_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b111), - NULL, get_id_mmfr3_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b000), - NULL, get_id_isar0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b001), - NULL, get_id_isar1_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b010), - NULL, get_id_isar2_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b011), - NULL, get_id_isar3_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b100), - NULL, get_id_isar4_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b101), - NULL, get_id_isar5_el1 }, - { Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b001), - NULL, get_clidr_el1 }, - { Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b111), - NULL, get_aidr_el1 }, - { Op0(0b11), Op1(0b011), CRn(0b0000), CRm(0b0000), Op2(0b001), - NULL, get_ctr_el0 }, -}; - -static int reg_from_user(void *val, const void __user *uaddr, u64 id) -{ - /* This Just Works because we are little endian. */ - if (copy_from_user(val, uaddr, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -static int reg_to_user(void __user *uaddr, const void *val, u64 id) -{ - /* This Just Works because we are little endian. 
*/ - if (copy_to_user(uaddr, val, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -static int get_invariant_sys_reg(u64 id, void __user *uaddr) -{ - struct sys_reg_params params; - const struct sys_reg_desc *r; - - if (!index_to_params(id, ¶ms)) - return -ENOENT; - - r = find_reg(¶ms, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)); - if (!r) - return -ENOENT; - - return reg_to_user(uaddr, &r->val, id); -} +/* Decode an index value, and find the sys_reg_desc entry. */ +static const struct sys_reg_desc * +id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, + const struct sys_reg_desc table[], unsigned int num) -static int set_invariant_sys_reg(u64 id, void __user *uaddr) { - struct sys_reg_params params; const struct sys_reg_desc *r; - int err; - u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */ - - if (!index_to_params(id, ¶ms)) - return -ENOENT; - r = find_reg(¶ms, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)); - if (!r) - return -ENOENT; - err = reg_from_user(&val, uaddr, id); - if (err) - return err; - - /* This is what we mean by invariant: you can't change it. */ - if (r->val != val) - return -EINVAL; - - return 0; -} - -static bool is_valid_cache(u32 val) -{ - u32 level, ctype; + /* We only do sys_reg for now. */ + if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) + return NULL; - if (val >= CSSELR_MAX) - return -ENOENT; + r = get_reg_by_id(id, table, num); - /* Bottom bit is Instruction or Data bit. Next 3 bits are level. */ - level = (val >> 1); - ctype = (cache_levels >> (level * 3)) & 7; + /* Not saved in the sys_reg array and not otherwise accessible? */ + if (r && (!(r->reg || r->get_user) || sysreg_hidden(vcpu, r))) + r = NULL; - switch (ctype) { - case 0: /* No cache */ - return false; - case 1: /* Instruction cache only */ - return (val & 1); - case 2: /* Data cache only */ - case 4: /* Unified cache */ - return !(val & 1); - case 3: /* Separate instruction and data caches */ - return true; - default: /* Reserved: we can't know instruction or data. */ - return false; - } + return r; } -static int demux_c15_get(u64 id, void __user *uaddr) +static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val; u32 __user *uval = uaddr; @@ -795,16 +5269,16 @@ static int demux_c15_get(u64 id, void __user *uaddr) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; - if (!is_valid_cache(val)) + if (val >= CSSELR_MAX) return -ENOENT; - return put_user(get_ccsidr(val), uval); + return put_user(get_ccsidr(vcpu, val), uval); default: return -ENOENT; } } -static int demux_c15_set(u64 id, void __user *uaddr) +static int demux_c15_set(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val, newval; u32 __user *uval = uaddr; @@ -820,77 +5294,119 @@ static int demux_c15_set(u64 id, void __user *uaddr) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; - if (!is_valid_cache(val)) + if (val >= CSSELR_MAX) return -ENOENT; if (get_user(newval, uval)) return -EFAULT; - /* This is also invariant: you can't change it. 
*/ - if (newval != get_ccsidr(val)) - return -EINVAL; - return 0; + return set_ccsidr(vcpu, val, newval); default: return -ENOENT; } } -int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +static u64 kvm_one_reg_to_id(const struct kvm_one_reg *reg) { - const struct sys_reg_desc *r; - void __user *uaddr = (void __user *)(unsigned long)reg->addr; + switch(reg->id) { + case KVM_REG_ARM_TIMER_CVAL: + return TO_ARM64_SYS_REG(CNTV_CVAL_EL0); + case KVM_REG_ARM_TIMER_CNT: + return TO_ARM64_SYS_REG(CNTVCT_EL0); + default: + return reg->id; + } +} - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) - return demux_c15_get(reg->id, uaddr); +int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; + const struct sys_reg_desc *r; + u64 id = kvm_one_reg_to_id(reg); + u64 val; + int ret; - if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) + r = id_to_sys_reg_desc(vcpu, id, table, num); + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; - r = index_to_sys_reg_desc(vcpu, reg->id); - if (!r) - return get_invariant_sys_reg(reg->id, uaddr); + if (r->get_user) { + ret = (r->get_user)(vcpu, r, &val); + } else { + val = __vcpu_sys_reg(vcpu, r->reg); + ret = 0; + } + + if (!ret) + ret = put_user(val, uaddr); - return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id); + return ret; } -int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { - const struct sys_reg_desc *r; void __user *uaddr = (void __user *)(unsigned long)reg->addr; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) - return demux_c15_set(reg->id, uaddr); + return demux_c15_get(vcpu, reg->id, uaddr); + + return kvm_sys_reg_get_user(vcpu, reg, + sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); +} + +int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; + const struct sys_reg_desc *r; + u64 id = kvm_one_reg_to_id(reg); + u64 val; + int ret; + + if (get_user(val, uaddr)) + return -EFAULT; - if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) + r = id_to_sys_reg_desc(vcpu, id, table, num); + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; - r = index_to_sys_reg_desc(vcpu, reg->id); - if (!r) - return set_invariant_sys_reg(reg->id, uaddr); + if (sysreg_user_write_ignore(vcpu, r)) + return 0; + + if (r->set_user) { + ret = (r->set_user)(vcpu, r, val); + } else { + __vcpu_assign_sys_reg(vcpu, r->reg, val); + ret = 0; + } - return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id); + return ret; } -static unsigned int num_demux_regs(void) +int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { - unsigned int i, count = 0; + void __user *uaddr = (void __user *)(unsigned long)reg->addr; - for (i = 0; i < CSSELR_MAX; i++) - if (is_valid_cache(i)) - count++; + if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) + return demux_c15_set(vcpu, reg->id, uaddr); - return count; + return kvm_sys_reg_set_user(vcpu, reg, + sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); +} + +static unsigned int num_demux_regs(void) +{ + return CSSELR_MAX; } static int write_demux_regids(u64 __user *uindices) { - u64 val = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX; + u64 val = KVM_REG_ARM64 | 
KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX; unsigned int i; val |= KVM_REG_ARM_DEMUX_ID_CCSIDR; for (i = 0; i < CSSELR_MAX; i++) { - if (!is_valid_cache(i)) - continue; if (put_user(val | i, uindices)) return -EFAULT; uindices++; @@ -911,78 +5427,79 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg) static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) { + u64 idx; + if (!*uind) return true; - if (put_user(sys_reg_to_index(reg), *uind)) + switch (reg_to_encoding(reg)) { + case SYS_CNTV_CVAL_EL0: + idx = KVM_REG_ARM_TIMER_CVAL; + break; + case SYS_CNTVCT_EL0: + idx = KVM_REG_ARM_TIMER_CNT; + break; + default: + idx = sys_reg_to_index(reg); + } + + if (put_user(idx, *uind)) return false; (*uind)++; return true; } +static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 __user **uind, + unsigned int *total) +{ + /* + * Ignore registers we trap but don't save, + * and for which no custom user accessor is provided. + */ + if (!(rd->reg || rd->get_user)) + return 0; + + if (sysreg_hidden(vcpu, rd)) + return 0; + + if (!copy_reg_to_user(rd, uind)) + return -EFAULT; + + (*total)++; + return 0; +} + /* Assumed ordered tables, see kvm_sys_reg_table_init. */ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) { - const struct sys_reg_desc *i1, *i2, *end1, *end2; + const struct sys_reg_desc *i2, *end2; unsigned int total = 0; - size_t num; + int err; - /* We check for duplicates here, to allow arch-specific overrides. */ - i1 = get_target_table(vcpu->arch.target, true, &num); - end1 = i1 + num; i2 = sys_reg_descs; end2 = sys_reg_descs + ARRAY_SIZE(sys_reg_descs); - BUG_ON(i1 == end1 || i2 == end2); - - /* Walk carefully, as both tables may refer to the same register. */ - while (i1 || i2) { - int cmp = cmp_sys_reg(i1, i2); - /* target-specific overrides generic entry. */ - if (cmp <= 0) { - /* Ignore registers we trap but don't save. */ - if (i1->reg) { - if (!copy_reg_to_user(i1, &uind)) - return -EFAULT; - total++; - } - } else { - /* Ignore registers we trap but don't save. */ - if (i2->reg) { - if (!copy_reg_to_user(i2, &uind)) - return -EFAULT; - total++; - } - } - - if (cmp <= 0 && ++i1 == end1) - i1 = NULL; - if (cmp >= 0 && ++i2 == end2) - i2 = NULL; + while (i2 != end2) { + err = walk_one_sys_reg(vcpu, i2++, &uind, &total); + if (err) + return err; } return total; } unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu) { - return ARRAY_SIZE(invariant_sys_regs) - + num_demux_regs() + return num_demux_regs() + walk_sys_regs(vcpu, (u64 __user *)NULL); } int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - unsigned int i; int err; - /* Then give them all the invariant registers' indices. 
*/ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) { - if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) - return -EFAULT; - uindices++; - } - err = walk_sys_regs(vcpu, uindices); if (err < 0) return err; @@ -991,60 +5508,179 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -void kvm_sys_reg_table_init(void) +#define KVM_ARM_FEATURE_ID_RANGE_INDEX(r) \ + KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(r), \ + sys_reg_Op1(r), \ + sys_reg_CRn(r), \ + sys_reg_CRm(r), \ + sys_reg_Op2(r)) + +int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range) { - unsigned int i; - struct sys_reg_desc clidr; + const void *zero_page = page_to_virt(ZERO_PAGE(0)); + u64 __user *masks = (u64 __user *)range->addr; - /* Make sure tables are unique and in order. */ - for (i = 1; i < ARRAY_SIZE(sys_reg_descs); i++) - BUG_ON(cmp_sys_reg(&sys_reg_descs[i-1], &sys_reg_descs[i]) >= 0); + /* Only feature id range is supported, reserved[13] must be zero. */ + if (range->range || + memcmp(range->reserved, zero_page, sizeof(range->reserved))) + return -EINVAL; + + /* Wipe the whole thing first */ + if (clear_user(masks, KVM_ARM_FEATURE_ID_RANGE_SIZE * sizeof(__u64))) + return -EFAULT; + + for (int i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *reg = &sys_reg_descs[i]; + u32 encoding = reg_to_encoding(reg); + u64 val; + + if (!is_feature_id_reg(encoding) || !reg->set_user) + continue; + + if (!reg->val || + (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) { + continue; + } + val = reg->val; + + if (put_user(val, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding)))) + return -EFAULT; + } + + return 0; +} + +static void vcpu_set_hcr(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + if (has_vhe() || has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_el1_is_32bit(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_RW; - /* We abuse the reset function to overwrite the table itself. */ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) - invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; /* - * CLIDR format is awkward, so clean it up. See ARM B4.1.20: - * - * If software reads the Cache Type fields from Ctype1 - * upwards, once it has seen a value of 0b000, no caches - * exist at further-out levels of the hierarchy. So, for - * example, if Ctype3 is the first Cache Type field with a - * value of 0b000, the values of Ctype4 to Ctype7 must be - * ignored. + * In the absence of FGT, we cannot independently trap TLBI + * Range instructions. This isn't great, but trapping all + * TLBIs would be far worse. Live with it... */ - get_clidr_el1(NULL, &clidr); /* Ugly... */ - cache_levels = clidr.val; - for (i = 0; i < 7; i++) - if (((cache_levels >> (i*3)) & 7) == 0) - break; - /* Clear all higher bits. 
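The (now deleted) CLIDR clean-up above depends on the architectural rule quoted in the comment: the Ctype fields are 3 bits each, read from Ctype1 upwards, and the first 0b000 terminates the cache hierarchy. A standalone illustration of that walk:

#include <stdint.h>
#include <stdio.h>

/* Count cache levels in a CLIDR_EL1-style value: 3-bit Ctype fields, Ctype1 at bits [2:0]. */
static unsigned int clidr_levels(uint64_t clidr)
{
	unsigned int level;

	for (level = 0; level < 7; level++)
		if (((clidr >> (level * 3)) & 7) == 0)
			break;
	return level;
}

int main(void)
{
	/* Ctype1=0b011 (separate I/D), Ctype2=0b100 (unified), Ctype3=0b000: two levels. */
	uint64_t clidr = (0b100 << 3) | 0b011;

	printf("cache levels: %u\n", clidr_levels(clidr));
	return 0;
}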
*/ - cache_levels &= (1 << (i*3))-1; + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + vcpu->arch.hcr_el2 |= HCR_TTLBOS; } -/** - * kvm_reset_sys_regs - sets system registers to reset value - * @vcpu: The VCPU pointer +void kvm_calculate_traps(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->arch.config_lock); + vcpu_set_hcr(vcpu); + vcpu_set_ich_hcr(vcpu); + vcpu_set_hcrx(vcpu); + + if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) + goto out; + + compute_fgu(kvm, HFGRTR_GROUP); + compute_fgu(kvm, HFGITR_GROUP); + compute_fgu(kvm, HDFGRTR_GROUP); + compute_fgu(kvm, HAFGRTR_GROUP); + compute_fgu(kvm, HFGRTR2_GROUP); + compute_fgu(kvm, HFGITR2_GROUP); + compute_fgu(kvm, HDFGRTR2_GROUP); + + set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); +out: + mutex_unlock(&kvm->arch.config_lock); +} + +/* + * Perform last adjustments to the ID registers that are implied by the + * configuration outside of the ID regs themselves, as well as any + * initialisation that directly depend on these ID registers (such as + * RES0/RES1 behaviours). This is not the place to configure traps though. * - * This function finds the right table above and sets the registers on the - * virtual CPU struct to their architecturally defined reset values. + * Because this can be called once per CPU, changes must be idempotent. */ -void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) { - size_t num; - const struct sys_reg_desc *table; + struct kvm *kvm = vcpu->kvm; + + guard(mutex)(&kvm->arch.config_lock); + + /* + * This hacks into the ID registers, so only perform it when the + * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream. + */ + if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) { + u64 val; + + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); + val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val); + } + + if (vcpu_has_nv(vcpu)) { + int ret = kvm_init_nv_sysregs(vcpu); + if (ret) + return ret; + } + + return 0; +} + +int __init kvm_sys_reg_table_init(void) +{ + const struct sys_reg_desc *gicv3_regs; + bool valid = true; + unsigned int i, sz; + int ret = 0; + + /* Make sure tables are unique and in order. */ + valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), true); + valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), false); + valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), false); + valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), false); + valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), false); + valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false); + + gicv3_regs = vgic_v3_get_sysreg_table(&sz); + valid &= check_sysreg_table(gicv3_regs, sz, false); + + if (!valid) + return -EINVAL; + + init_imp_id_regs(); + + ret = populate_nv_trap_config(); - /* Catch someone adding a register without putting in reset entry. */ - memset(&vcpu->arch.ctxt.sys_regs, 0x42, sizeof(vcpu->arch.ctxt.sys_regs)); + check_feature_map(); - /* Generic chip reset first (so target could override). 
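kvm_calculate_traps() above is called for every vCPU, but the fine-grained-UNDEF (FGU) computation is per VM, so it is guarded by a flag under the config lock and only runs the first time through. A minimal standalone sketch of that compute-once pattern, with plain pthread and bool stand-ins for the kernel's mutex and flag helpers and a placeholder computation:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct vm {
	pthread_mutex_t config_lock;
	bool fgu_initialized;		/* stand-in for KVM_ARCH_FLAG_FGU_INITIALIZED */
	unsigned long fgu[4];		/* per-group fine-grained UNDEF masks */
};

static void compute_fgu(struct vm *vm, int group)
{
	vm->fgu[group] = 0xffUL << group;	/* placeholder, not the real derivation */
}

/* Called once per vCPU; the expensive per-VM part runs only once. */
static void calculate_traps(struct vm *vm)
{
	pthread_mutex_lock(&vm->config_lock);

	/* per-vCPU trap configuration would go here */

	if (!vm->fgu_initialized) {
		for (int g = 0; g < 4; g++)
			compute_fgu(vm, g);
		vm->fgu_initialized = true;
	}
	pthread_mutex_unlock(&vm->config_lock);
}

int main(void)
{
	struct vm vm = { .config_lock = PTHREAD_MUTEX_INITIALIZER };

	calculate_traps(&vm);	/* first vCPU: computes the FGU masks */
	calculate_traps(&vm);	/* later vCPUs: skip straight through */
	printf("fgu[0]=%#lx initialized=%d\n", vm.fgu[0], vm.fgu_initialized);
	return 0;
}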
*/ - reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++) + ret = populate_sysreg_config(sys_reg_descs + i, i); - table = get_target_table(vcpu->arch.target, true, &num); - reset_sys_reg_descs(vcpu, table, num); + for (i = 0; !ret && i < ARRAY_SIZE(sys_insn_descs); i++) + ret = populate_sysreg_config(sys_insn_descs + i, i); - for (num = 1; num < NR_SYS_REGS; num++) - if (vcpu_sys_reg(vcpu, num) == 0x4242424242424242) - panic("Didn't reset vcpu_sys_reg(%zi)", num); + return ret; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index d50d3722998e..b3f904472fac 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> @@ -5,34 +6,59 @@ * Derived from arch/arm/kvm/coproc.h * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Authors: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__ #define __ARM64_KVM_SYS_REGS_LOCAL_H__ +#include <linux/bsearch.h> + +#define reg_to_encoding(x) \ + sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ + (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) + struct sys_reg_params { u8 Op0; u8 Op1; u8 CRn; u8 CRm; u8 Op2; - u8 Rt; + u64 regval; bool is_write; }; +#define encoding_to_params(reg) \ + ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \ + .Op1 = sys_reg_Op1(reg), \ + .CRn = sys_reg_CRn(reg), \ + .CRm = sys_reg_CRm(reg), \ + .Op2 = sys_reg_Op2(reg) }) + +#define esr_sys64_to_params(esr) \ + ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ + .Op1 = ((esr) >> 14) & 0x7, \ + .CRn = ((esr) >> 10) & 0xf, \ + .CRm = ((esr) >> 1) & 0xf, \ + .Op2 = ((esr) >> 17) & 0x7, \ + .is_write = !((esr) & 1) }) + +#define esr_cp1x_32_to_params(esr) \ + ((struct sys_reg_params){ .Op1 = ((esr) >> 14) & 0x7, \ + .CRn = ((esr) >> 10) & 0xf, \ + .CRm = ((esr) >> 1) & 0xf, \ + .Op2 = ((esr) >> 17) & 0x7, \ + .is_write = !((esr) & 1) }) + struct sys_reg_desc { + /* Sysreg string for debug */ + const char *name; + + enum { + AA32_DIRECT, + AA32_LO, + AA32_HI, + } aarch32_map; + /* MRS/MSR instruction which accesses it. */ u8 Op0; u8 Op1; @@ -42,24 +68,54 @@ struct sys_reg_desc { /* Trapped access from guest, if non-NULL. */ bool (*access)(struct kvm_vcpu *, - const struct sys_reg_params *, + struct sys_reg_params *, const struct sys_reg_desc *); - /* Initialization for vcpu. */ - void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); + /* + * Initialization for vcpu. Return initialized value, or KVM + * sanitized value for ID registers. + */ + u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); /* Index into sys_reg[], or 0 if we don't need to save it. 
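esr_sys64_to_params() above is a pure bitfield decode of the ESR_EL2 ISS for an MSR/MRS trap. The same extraction as a standalone function, reusing exactly the shifts and masks from the macro:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct params { uint8_t Op0, Op1, CRn, CRm, Op2; bool is_write; };

static struct params esr_sys64_decode(uint64_t esr)
{
	return (struct params){
		.Op0 = (esr >> 20) & 0x3,
		.Op1 = (esr >> 14) & 0x7,
		.CRn = (esr >> 10) & 0xf,
		.CRm = (esr >>  1) & 0xf,
		.Op2 = (esr >> 17) & 0x7,
		.is_write = !(esr & 1),	/* bit 0 set means a read (MRS) */
	};
}

int main(void)
{
	/* An arbitrary ISS value, only meant to show the field extraction. */
	struct params p = esr_sys64_decode(0x00362d01);

	printf("Op0=%u Op1=%u CRn=%u CRm=%u Op2=%u %s\n",
	       p.Op0, p.Op1, p.CRn, p.CRm, p.Op2, p.is_write ? "write" : "read");
	return 0;
}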
*/ int reg; - /* Value (usually reset value) */ + /* Value (usually reset value), or write mask for idregs */ u64 val; + + /* Custom get/set_user functions, fallback to generic if NULL */ + int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val); + int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val); + + /* Return mask of REG_* runtime visibility overrides */ + unsigned int (*visibility)(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd); }; -static inline void print_sys_reg_instr(const struct sys_reg_params *p) +#define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ +#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ +#define REG_USER_WI (1 << 2) /* WI from userspace only */ + +static __printf(2, 3) +inline void print_sys_reg_msg(const struct sys_reg_params *p, + char *fmt, ...) { + va_list va; + + va_start(va, fmt); /* Look, we even formatted it for you to paste into the table! */ - kvm_pr_unimpl(" { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n", - p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read"); + kvm_pr_unimpl("%pV { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n", + &(struct va_format){ fmt, &va }, + p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, str_write_read(p->is_write)); + va_end(va); +} + +static inline void print_sys_reg_instr(const struct sys_reg_params *p) +{ + /* GCC warns on an empty format string */ + print_sys_reg_msg(p, "%s", ""); } static inline bool ignore_write(struct kvm_vcpu *vcpu, @@ -69,44 +125,55 @@ static inline bool ignore_write(struct kvm_vcpu *vcpu, } static inline bool read_zero(struct kvm_vcpu *vcpu, - const struct sys_reg_params *p) + struct sys_reg_params *p) { - *vcpu_reg(vcpu, p->Rt) = 0; + p->regval = 0; return true; } -static inline bool write_to_read_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) +/* Reset functions */ +static inline u64 reset_unknown(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) { - kvm_debug("sys_reg write to read-only register at: %lx\n", - *vcpu_pc(vcpu)); - print_sys_reg_instr(params); - return false; + BUG_ON(!r->reg); + BUG_ON(r->reg >= NR_SYS_REGS); + __vcpu_assign_sys_reg(vcpu, r->reg, 0x1de7ec7edbadc0deULL); + return __vcpu_sys_reg(vcpu, r->reg); } -static inline bool read_from_write_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) +static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - kvm_debug("sys_reg read to write-only register at: %lx\n", - *vcpu_pc(vcpu)); - print_sys_reg_instr(params); - return false; + BUG_ON(!r->reg); + BUG_ON(r->reg >= NR_SYS_REGS); + __vcpu_assign_sys_reg(vcpu, r->reg, r->val); + return __vcpu_sys_reg(vcpu, r->reg); } -/* Reset functions */ -static inline void reset_unknown(struct kvm_vcpu *vcpu, +static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + if (likely(!r->visibility)) + return 0; + + return r->visibility(vcpu, r); +} + +static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - BUG_ON(!r->reg); - BUG_ON(r->reg >= NR_SYS_REGS); - vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; + return sysreg_visibility(vcpu, r) & REG_HIDDEN; } -static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) { - BUG_ON(!r->reg); - BUG_ON(r->reg >= NR_SYS_REGS); - 
vcpu_sys_reg(vcpu, r->reg) = r->val; + return sysreg_visibility(vcpu, r) & REG_RAZ; +} + +static inline bool sysreg_user_write_ignore(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return sysreg_visibility(vcpu, r) & REG_USER_WI; } static inline int cmp_sys_reg(const struct sys_reg_desc *i1, @@ -128,11 +195,72 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1, return i1->Op2 - i2->Op2; } +static inline int match_sys_reg(const void *key, const void *elt) +{ + const unsigned long pval = (unsigned long)key; + const struct sys_reg_desc *r = elt; + + return pval - reg_to_encoding(r); +} + +static inline const struct sys_reg_desc * +find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], + unsigned int num) +{ + unsigned long pval = reg_to_encoding(params); + + return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); +} + +const struct sys_reg_desc *get_reg_by_id(u64 id, + const struct sys_reg_desc table[], + unsigned int num); + +int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num); +int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num); +bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); + +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); + +#define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x #define CRn(_x) .CRn = _x #define CRm(_x) .CRm = _x #define Op2(_x) .Op2 = _x +#define SYS_DESC(reg) \ + .name = #reg, \ + Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + +#define CP15_SYS_DESC(reg) \ + .name = #reg, \ + .aarch32_map = AA32_DIRECT, \ + Op0(0), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + +#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ +({ \ + u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ + (val) &= ~reg##_##field##_MASK; \ + (val) |= FIELD_PREP(reg##_##field##_MASK, \ + min(__f_val, \ + (u64)SYS_FIELD_VALUE(reg, field, limit))); \ + (val); \ +}) + +#define TO_ARM64_SYS_REG(r) ARM64_SYS_REG(sys_reg_Op0(SYS_ ## r), \ + sys_reg_Op1(SYS_ ## r), \ + sys_reg_CRn(SYS_ ## r), \ + sys_reg_CRm(SYS_ ## r), \ + sys_reg_Op2(SYS_ ## r)) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ diff --git a/arch/arm64/kvm/sys_regs_generic_v8.c b/arch/arm64/kvm/sys_regs_generic_v8.c deleted file mode 100644 index 4268ab9356b1..000000000000 --- a/arch/arm64/kvm/sys_regs_generic_v8.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * Based on arch/arm/kvm/coproc_a15.c: - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Authors: Rusty Russell <rusty@rustcorp.au> - * Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
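find_reg() above can binary-search because every table is kept sorted on (Op0, Op1, CRn, CRm, Op2), which is exactly what check_sysreg_table() verifies at init time. A standalone sketch of the same lookup with a simplified packing (illustrative, not the kernel's sys_reg() layout):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct desc { const char *name; uint8_t Op0, Op1, CRn, CRm, Op2; };

/* Any packing that preserves (Op0, Op1, CRn, CRm, Op2) ordering will do. */
static uint32_t encode(uint8_t Op0, uint8_t Op1, uint8_t CRn, uint8_t CRm, uint8_t Op2)
{
	return ((uint32_t)Op0 << 16) | ((uint32_t)Op1 << 12) |
	       ((uint32_t)CRn << 8) | ((uint32_t)CRm << 4) | Op2;
}

/* Must stay sorted by the packed encoding. */
static const struct desc table[] = {
	{ "SCTLR_EL1", 3, 0, 1, 0, 0 },
	{ "TTBR0_EL1", 3, 0, 2, 0, 0 },
	{ "TCR_EL1",   3, 0, 2, 0, 2 },
};

static int match(const void *key, const void *elt)
{
	const struct desc *d = elt;
	uint32_t k = *(const uint32_t *)key;
	uint32_t e = encode(d->Op0, d->Op1, d->CRn, d->CRm, d->Op2);

	return (k > e) - (k < e);
}

int main(void)
{
	uint32_t key = encode(3, 0, 2, 0, 2);	/* TCR_EL1 */
	const struct desc *d = bsearch(&key, table,
				       sizeof(table) / sizeof(table[0]),
				       sizeof(table[0]), match);

	printf("%s\n", d ? d->name : "not found");
	return 0;
}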
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ -#include <linux/kvm_host.h> -#include <asm/cputype.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_host.h> -#include <asm/kvm_emulate.h> -#include <asm/kvm_coproc.h> -#include <linux/init.h> - -#include "sys_regs.h" - -static bool access_actlr(struct kvm_vcpu *vcpu, - const struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt) = vcpu_sys_reg(vcpu, ACTLR_EL1); - return true; -} - -static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) -{ - u64 actlr; - - asm volatile("mrs %0, actlr_el1\n" : "=r" (actlr)); - vcpu_sys_reg(vcpu, ACTLR_EL1) = actlr; -} - -/* - * Implementation specific sys-reg registers. - * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 - */ -static const struct sys_reg_desc genericv8_sys_regs[] = { - /* ACTLR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b001), - access_actlr, reset_actlr, ACTLR_EL1 }, -}; - -static const struct sys_reg_desc genericv8_cp15_regs[] = { - /* ACTLR */ - { Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b001), - access_actlr }, -}; - -static struct kvm_sys_reg_target_table genericv8_target_table = { - .table64 = { - .table = genericv8_sys_regs, - .num = ARRAY_SIZE(genericv8_sys_regs), - }, - .table32 = { - .table = genericv8_cp15_regs, - .num = ARRAY_SIZE(genericv8_cp15_regs), - }, -}; - -static int __init sys_reg_genericv8_init(void) -{ - unsigned int i; - - for (i = 1; i < ARRAY_SIZE(genericv8_sys_regs); i++) - BUG_ON(cmp_sys_reg(&genericv8_sys_regs[i-1], - &genericv8_sys_regs[i]) >= 0); - - kvm_register_target_sys_reg_table(KVM_ARM_TARGET_AEM_V8, - &genericv8_target_table); - kvm_register_target_sys_reg_table(KVM_ARM_TARGET_FOUNDATION_V8, - &genericv8_target_table); - kvm_register_target_sys_reg_table(KVM_ARM_TARGET_CORTEX_A57, - &genericv8_target_table); - return 0; -} -late_initcall(sys_reg_genericv8_init); diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h new file mode 100644 index 000000000000..86f9ea47be29 --- /dev/null +++ b/arch/arm64/kvm/trace.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TRACE_ARM64_KVM_H +#define _TRACE_ARM64_KVM_H + +#include "trace_arm.h" +#include "trace_handle_exit.h" + +#endif /* _TRACE_ARM64_KVM_H */ diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h new file mode 100644 index 000000000000..9c60f6465c78 --- /dev/null +++ b/arch/arm64/kvm/trace_arm.h @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ARM_ARM64_KVM_H + +#include <asm/kvm_emulate.h> +#include <kvm/arm_arch_timer.h> +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +/* + * Tracepoints for entry/exit to guest + */ +TRACE_EVENT(kvm_entry, + TP_PROTO(unsigned long vcpu_pc), + TP_ARGS(vcpu_pc), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + ), + + TP_printk("PC: 0x%016lx", __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), + TP_ARGS(ret, esr_ec, vcpu_pc), + + TP_STRUCT__entry( + __field( int, ret ) + __field( unsigned int, esr_ec ) + __field( unsigned long, vcpu_pc ) + ), + + 
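+	/* Only trap exits carry a meaningful ESR_EC; see the assignment below. */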
TP_fast_assign( + __entry->ret = ARM_EXCEPTION_CODE(ret); + __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0; + __entry->vcpu_pc = vcpu_pc; + ), + + TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%016lx", + __print_symbolic(__entry->ret, kvm_arm_exception_type), + __entry->esr_ec, + __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), + __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_guest_fault, + TP_PROTO(unsigned long vcpu_pc, unsigned long hsr, + unsigned long hxfar, + unsigned long long ipa), + TP_ARGS(vcpu_pc, hsr, hxfar, ipa), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, hsr ) + __field( unsigned long, hxfar ) + __field( unsigned long long, ipa ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->hsr = hsr; + __entry->hxfar = hxfar; + __entry->ipa = ipa; + ), + + TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#016lx", + __entry->ipa, __entry->hsr, + __entry->hxfar, __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_access_fault, + TP_PROTO(unsigned long ipa), + TP_ARGS(ipa), + + TP_STRUCT__entry( + __field( unsigned long, ipa ) + ), + + TP_fast_assign( + __entry->ipa = ipa; + ), + + TP_printk("IPA: %lx", __entry->ipa) +); + +TRACE_EVENT(kvm_irq_line, + TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level), + TP_ARGS(type, vcpu_idx, irq_num, level), + + TP_STRUCT__entry( + __field( unsigned int, type ) + __field( int, vcpu_idx ) + __field( int, irq_num ) + __field( int, level ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->vcpu_idx = vcpu_idx; + __entry->irq_num = irq_num; + __entry->level = level; + ), + + TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d", + (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" : + (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" : + (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? 
"VGIC SPI" : "UNKNOWN", + __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level) +); + +TRACE_EVENT(kvm_mmio_emulate, + TP_PROTO(unsigned long vcpu_pc, unsigned long instr, + unsigned long cpsr), + TP_ARGS(vcpu_pc, instr, cpsr), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, instr ) + __field( unsigned long, cpsr ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->instr = instr; + __entry->cpsr = cpsr; + ), + + TP_printk("Emulate MMIO at: 0x%016lx (instr: %08lx, cpsr: %08lx)", + __entry->vcpu_pc, __entry->instr, __entry->cpsr) +); + +TRACE_EVENT(kvm_mmio_nisv, + TP_PROTO(unsigned long vcpu_pc, unsigned long esr, + unsigned long far, unsigned long ipa), + TP_ARGS(vcpu_pc, esr, far, ipa), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, esr ) + __field( unsigned long, far ) + __field( unsigned long, ipa ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->esr = esr; + __entry->far = far; + __entry->ipa = ipa; + ), + + TP_printk("ipa %#016lx, esr %#016lx, far %#016lx, pc %#016lx", + __entry->ipa, __entry->esr, + __entry->far, __entry->vcpu_pc) +); + + +TRACE_EVENT(kvm_set_way_flush, + TP_PROTO(unsigned long vcpu_pc, bool cache), + TP_ARGS(vcpu_pc, cache), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( bool, cache ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->cache = cache; + ), + + TP_printk("S/W flush at 0x%016lx (cache %s)", + __entry->vcpu_pc, str_on_off(__entry->cache)) +); + +TRACE_EVENT(kvm_toggle_cache, + TP_PROTO(unsigned long vcpu_pc, bool was, bool now), + TP_ARGS(vcpu_pc, was, now), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( bool, was ) + __field( bool, now ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->was = was; + __entry->now = now; + ), + + TP_printk("VM op at 0x%016lx (cache was %s, now %s)", + __entry->vcpu_pc, str_on_off(__entry->was), + str_on_off(__entry->now)) +); + +/* + * Tracepoints for arch_timer + */ +TRACE_EVENT(kvm_timer_update_irq, + TP_PROTO(unsigned long vcpu_id, __u32 irq, int level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( int, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +TRACE_EVENT(kvm_get_timer_map, + TP_PROTO(unsigned long vcpu_id, struct timer_map *map), + TP_ARGS(vcpu_id, map), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( int, direct_vtimer ) + __field( int, direct_ptimer ) + __field( int, emul_vtimer ) + __field( int, emul_ptimer ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer); + __entry->direct_ptimer = + (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1; + __entry->emul_vtimer = + (map->emul_vtimer) ? arch_timer_ctx_index(map->emul_vtimer) : -1; + __entry->emul_ptimer = + (map->emul_ptimer) ? 
arch_timer_ctx_index(map->emul_ptimer) : -1; + ), + + TP_printk("VCPU: %ld, dv: %d, dp: %d, ev: %d, ep: %d", + __entry->vcpu_id, + __entry->direct_vtimer, + __entry->direct_ptimer, + __entry->emul_vtimer, + __entry->emul_ptimer) +); + +TRACE_EVENT(kvm_timer_save_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = timer_get_ctl(ctx); + __entry->cval = timer_get_cval(ctx); + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_restore_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = timer_get_ctl(ctx); + __entry->cval = timer_get_cval(ctx); + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_hrtimer_expire, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_emulate, + TP_PROTO(struct arch_timer_context *ctx, bool should_fire), + TP_ARGS(ctx, should_fire), + + TP_STRUCT__entry( + __field( int, timer_idx ) + __field( bool, should_fire ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + __entry->should_fire = should_fire; + ), + + TP_printk("arch_timer_ctx_index: %d (should_fire: %d)", + __entry->timer_idx, __entry->should_fire) +); + +TRACE_EVENT(kvm_nested_eret, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned long elr_el2, + unsigned long spsr_el2), + TP_ARGS(vcpu, elr_el2, spsr_el2), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(unsigned long, elr_el2) + __field(unsigned long, spsr_el2) + __field(unsigned long, target_mode) + __field(unsigned long, hcr_el2) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->elr_el2 = elr_el2; + __entry->spsr_el2 = spsr_el2; + __entry->target_mode = spsr_el2 & (PSR_MODE_MASK | PSR_MODE32_BIT); + __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2); + ), + + TP_printk("elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx", + __entry->elr_el2, __entry->spsr_el2, + __print_symbolic(__entry->target_mode, kvm_mode_names), + __entry->hcr_el2) +); + +TRACE_EVENT(kvm_inject_nested_exception, + TP_PROTO(struct kvm_vcpu *vcpu, u64 esr_el2, int type), + TP_ARGS(vcpu, esr_el2, type), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(unsigned long, esr_el2) + __field(int, type) + __field(unsigned long, spsr_el2) + __field(unsigned long, pc) + __field(unsigned long, source_mode) + __field(unsigned long, hcr_el2) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->esr_el2 = esr_el2; + __entry->type = type; + __entry->spsr_el2 = *vcpu_cpsr(vcpu); + __entry->pc = *vcpu_pc(vcpu); + __entry->source_mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2); + ), + + TP_printk("%s: esr_el2 0x%lx elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx", + 
__print_symbolic(__entry->type, kvm_exception_type_names), + __entry->esr_el2, __entry->pc, __entry->spsr_el2, + __print_symbolic(__entry->source_mode, kvm_mode_names), + __entry->hcr_el2) +); + +TRACE_EVENT(kvm_forward_sysreg_trap, + TP_PROTO(struct kvm_vcpu *vcpu, u32 sysreg, bool is_read), + TP_ARGS(vcpu, sysreg, is_read), + + TP_STRUCT__entry( + __field(u64, pc) + __field(u32, sysreg) + __field(bool, is_read) + ), + + TP_fast_assign( + __entry->pc = *vcpu_pc(vcpu); + __entry->sysreg = sysreg; + __entry->is_read = is_read; + ), + + TP_printk("%llx %c (%d,%d,%d,%d,%d)", + __entry->pc, + __entry->is_read ? 'R' : 'W', + sys_reg_Op0(__entry->sysreg), + sys_reg_Op1(__entry->sysreg), + sys_reg_CRn(__entry->sysreg), + sys_reg_CRm(__entry->sysreg), + sys_reg_Op2(__entry->sysreg)) +); + +#endif /* _TRACE_ARM_ARM64_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_arm + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h new file mode 100644 index 000000000000..a7ab9a3bbed0 --- /dev/null +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_HANDLE_EXIT_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HANDLE_EXIT_ARM64_KVM_H + +#include <linux/tracepoint.h> +#include "sys_regs.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +TRACE_EVENT(kvm_wfx_arm64, + TP_PROTO(unsigned long vcpu_pc, bool is_wfe), + TP_ARGS(vcpu_pc, is_wfe), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_wfe) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_wfe = is_wfe; + ), + + TP_printk("guest executed wf%c at: 0x%016lx", + __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_hvc_arm64, + TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), + TP_ARGS(vcpu_pc, r0, imm), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(unsigned long, r0) + __field(unsigned long, imm) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->r0 = r0; + __entry->imm = imm; + ), + + TP_printk("HVC at 0x%016lx (r0: 0x%016lx, imm: 0x%lx)", + __entry->vcpu_pc, __entry->r0, __entry->imm) +); + +/* + * The dreg32 name is a leftover from a distant past. This will really + * output a 64bit value... 
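+ * Callers use the helper generated by TRACE_EVENT(), e.g. (illustrative):
+ *	trace_kvm_arm_set_dreg32("MDSCR_EL1", (u64)mdscr);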
+ */ +TRACE_EVENT(kvm_arm_set_dreg32, + TP_PROTO(const char *name, __u64 value), + TP_ARGS(name, value), + + TP_STRUCT__entry( + __field(const char *, name) + __field(__u64, value) + ), + + TP_fast_assign( + __entry->name = name; + __entry->value = value; + ), + + TP_printk("%s: 0x%llx", __entry->name, __entry->value) +); + +TRACE_EVENT(kvm_handle_sys_reg, + TP_PROTO(unsigned long hsr), + TP_ARGS(hsr), + + TP_STRUCT__entry( + __field(unsigned long, hsr) + ), + + TP_fast_assign( + __entry->hsr = hsr; + ), + + TP_printk("HSR 0x%08lx", __entry->hsr) +); + +TRACE_EVENT(kvm_sys_access, + TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), + TP_ARGS(vcpu_pc, params, reg), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_write) + __field(const char *, name) + __field(u8, Op0) + __field(u8, Op1) + __field(u8, CRn) + __field(u8, CRm) + __field(u8, Op2) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_write = params->is_write; + __entry->name = reg->name; + __entry->Op0 = reg->Op0; + __entry->Op0 = reg->Op0; + __entry->Op1 = reg->Op1; + __entry->CRn = reg->CRn; + __entry->CRm = reg->CRm; + __entry->Op2 = reg->Op2; + ), + + TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", + __entry->vcpu_pc, __entry->name ?: "UNKN", + __entry->Op0, __entry->Op1, __entry->CRn, + __entry->CRm, __entry->Op2, + str_write_read(__entry->is_write)) +); + +TRACE_EVENT(kvm_set_guest_debug, + TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), + TP_ARGS(vcpu, guest_debug), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(__u32, guest_debug) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->guest_debug = guest_debug; + ), + + TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) +); + +#endif /* _TRACE_HANDLE_EXIT_ARM64_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_handle_exit + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/arm64/kvm/trng.c b/arch/arm64/kvm/trng.c new file mode 100644 index 000000000000..99bdd7103c9c --- /dev/null +++ b/arch/arm64/kvm/trng.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 Arm Ltd. + +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> + +#include <kvm/arm_hypercalls.h> + +#define ARM_SMCCC_TRNG_VERSION_1_0 0x10000UL + +/* Those values are deliberately separate from the generic SMCCC definitions. 
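+ * They come from the Arm True Random Number Generator firmware interface
+ * specification rather than reusing the generic SMCCC_RET_* constants.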
*/ +#define TRNG_SUCCESS 0UL +#define TRNG_NOT_SUPPORTED ((unsigned long)-1) +#define TRNG_INVALID_PARAMETER ((unsigned long)-2) +#define TRNG_NO_ENTROPY ((unsigned long)-3) + +#define TRNG_MAX_BITS64 192 + +static const uuid_t arm_smc_trng_uuid __aligned(4) = UUID_INIT( + 0x0d21e000, 0x4384, 0x11eb, 0x80, 0x70, 0x52, 0x44, 0x55, 0x4e, 0x5a, 0x4c); + +static int kvm_trng_do_rnd(struct kvm_vcpu *vcpu, int size) +{ + DECLARE_BITMAP(bits, TRNG_MAX_BITS64); + u32 num_bits = smccc_get_arg1(vcpu); + int i; + + if (num_bits > 3 * size) { + smccc_set_retval(vcpu, TRNG_INVALID_PARAMETER, 0, 0, 0); + return 1; + } + + /* get as many bits as we need to fulfil the request */ + for (i = 0; i < DIV_ROUND_UP(num_bits, BITS_PER_LONG); i++) + bits[i] = get_random_long(); + + bitmap_clear(bits, num_bits, TRNG_MAX_BITS64 - num_bits); + + if (size == 32) + smccc_set_retval(vcpu, TRNG_SUCCESS, lower_32_bits(bits[1]), + upper_32_bits(bits[0]), lower_32_bits(bits[0])); + else + smccc_set_retval(vcpu, TRNG_SUCCESS, bits[2], bits[1], bits[0]); + + memzero_explicit(bits, sizeof(bits)); + return 1; +} + +int kvm_trng_call(struct kvm_vcpu *vcpu) +{ + const __le32 *u = (__le32 *)arm_smc_trng_uuid.b; + u32 func_id = smccc_get_function(vcpu); + unsigned long val = TRNG_NOT_SUPPORTED; + int size = 64; + + switch (func_id) { + case ARM_SMCCC_TRNG_VERSION: + val = ARM_SMCCC_TRNG_VERSION_1_0; + break; + case ARM_SMCCC_TRNG_FEATURES: + switch (smccc_get_arg1(vcpu)) { + case ARM_SMCCC_TRNG_VERSION: + case ARM_SMCCC_TRNG_FEATURES: + case ARM_SMCCC_TRNG_GET_UUID: + case ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + val = TRNG_SUCCESS; + } + break; + case ARM_SMCCC_TRNG_GET_UUID: + smccc_set_retval(vcpu, le32_to_cpu(u[0]), le32_to_cpu(u[1]), + le32_to_cpu(u[2]), le32_to_cpu(u[3])); + return 1; + case ARM_SMCCC_TRNG_RND32: + size = 32; + fallthrough; + case ARM_SMCCC_TRNG_RND64: + return kvm_trng_do_rnd(vcpu, size); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; +} diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c new file mode 100644 index 000000000000..91b22a014610 --- /dev/null +++ b/arch/arm64/kvm/va_layout.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 ARM Ltd. + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/kvm_host.h> +#include <linux/random.h> +#include <linux/memblock.h> +#include <asm/alternative.h> +#include <asm/debug-monitors.h> +#include <asm/insn.h> +#include <asm/kvm_mmu.h> +#include <asm/memory.h> + +/* + * The LSB of the HYP VA tag + */ +static u8 tag_lsb; +/* + * The HYP VA tag value with the region bit + */ +static u64 tag_val; +static u64 va_mask; + +/* + * Compute HYP VA by using the same computation as kern_hyp_va(). + */ +static u64 __early_kern_hyp_va(u64 addr) +{ + addr &= va_mask; + addr |= tag_val << tag_lsb; + return addr; +} + +/* + * Store a hyp VA <-> PA offset into a EL2-owned variable. + */ +static void init_hyp_physvirt_offset(void) +{ + u64 kern_va, hyp_va; + + /* Compute the offset from the hyp VA and PA of a random symbol. */ + kern_va = (u64)lm_alias(__hyp_text_start); + hyp_va = __early_kern_hyp_va(kern_va); + hyp_physvirt_offset = (s64)__pa(kern_va) - (s64)hyp_va; +} + +/* + * We want to generate a hyp VA with the following format (with V == + * vabits_actual): + * + * 63 ... V | V-1 | V-2 .. tag_lsb | tag_lsb - 1 .. 
0 + * --------------------------------------------------------- + * | 0000000 | hyp_va_msb | random tag | kern linear VA | + * |--------- tag_val -----------|----- va_mask ---| + * + * which does not conflict with the idmap regions. + */ +__init void kvm_compute_layout(void) +{ + phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start); + u64 hyp_va_msb; + + /* Where is my RAM region? */ + hyp_va_msb = idmap_addr & BIT(vabits_actual - 1); + hyp_va_msb ^= BIT(vabits_actual - 1); + + tag_lsb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^ + (u64)(high_memory - 1)); + + va_mask = GENMASK_ULL(tag_lsb - 1, 0); + tag_val = hyp_va_msb; + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && tag_lsb != (vabits_actual - 1)) { + /* We have some free bits to insert a random tag. */ + tag_val |= get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb); + } + tag_val >>= tag_lsb; + + init_hyp_physvirt_offset(); +} + +/* + * The .hyp.reloc ELF section contains a list of kimg positions that + * contains kimg VAs but will be accessed only in hyp execution context. + * Convert them to hyp VAs. See gen-hyprel.c for more details. + */ +__init void kvm_apply_hyp_relocations(void) +{ + int32_t *rel; + int32_t *begin = (int32_t *)__hyp_reloc_begin; + int32_t *end = (int32_t *)__hyp_reloc_end; + + for (rel = begin; rel < end; ++rel) { + uintptr_t *ptr, kimg_va; + + /* + * Each entry contains a 32-bit relative offset from itself + * to a kimg VA position. + */ + ptr = (uintptr_t *)lm_alias((char *)rel + *rel); + + /* Read the kimg VA value at the relocation address. */ + kimg_va = *ptr; + + /* Convert to hyp VA and store back to the relocation address. */ + *ptr = __early_kern_hyp_va((uintptr_t)lm_alias(kimg_va)); + } +} + +static u32 compute_instruction(int n, u32 rd, u32 rn) +{ + u32 insn = AARCH64_BREAK_FAULT; + + switch (n) { + case 0: + insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_AND, + AARCH64_INSN_VARIANT_64BIT, + rn, rd, va_mask); + break; + + case 1: + /* ROR is a variant of EXTR with Rm = Rn */ + insn = aarch64_insn_gen_extr(AARCH64_INSN_VARIANT_64BIT, + rn, rn, rd, + tag_lsb); + break; + + case 2: + insn = aarch64_insn_gen_add_sub_imm(rd, rn, + tag_val & GENMASK(11, 0), + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_ADSB_ADD); + break; + + case 3: + insn = aarch64_insn_gen_add_sub_imm(rd, rn, + tag_val & GENMASK(23, 12), + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_ADSB_ADD); + break; + + case 4: + /* ROR is a variant of EXTR with Rm = Rn */ + insn = aarch64_insn_gen_extr(AARCH64_INSN_VARIANT_64BIT, + rn, rn, rd, 64 - tag_lsb); + break; + } + + return insn; +} + +void __init kvm_update_va_mask(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + int i; + + BUG_ON(nr_inst != 5); + + for (i = 0; i < nr_inst; i++) { + u32 rd, rn, insn, oinsn; + + /* + * VHE doesn't need any address translation, let's NOP + * everything. + * + * Alternatively, if the tag is zero (because the layout + * dictates it and we don't have any spare bits in the + * address), NOP everything after masking the kernel VA. 
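+ *
+ * Otherwise the five patched instructions are: an AND with va_mask, a
+ * ROR by tag_lsb, two ADDs inserting tag_val twelve bits at a time, and
+ * a final ROR back (see compute_instruction() above).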
+ */ + if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN) || (!tag_val && i > 0)) { + updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); + continue; + } + + oinsn = le32_to_cpu(origptr[i]); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn); + rn = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, oinsn); + + insn = compute_instruction(i, rd, rn); + BUG_ON(insn == AARCH64_BREAK_FAULT); + + updptr[i] = cpu_to_le32(insn); + } +} + +void kvm_patch_vector_branch(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u64 addr; + u32 insn; + + BUG_ON(nr_inst != 4); + + if (!cpus_have_cap(ARM64_SPECTRE_V3A) || + WARN_ON_ONCE(cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))) + return; + + /* + * Compute HYP VA by using the same computation as kern_hyp_va() + */ + addr = __early_kern_hyp_va((u64)kvm_ksym_ref(__kvm_hyp_vector)); + + /* Use PC[10:7] to branch to the same vector in KVM */ + addr |= ((u64)origptr & GENMASK_ULL(10, 7)); + + /* + * Branch over the preamble in order to avoid the initial store on + * the stack (which we already perform in the hardening vectors). + */ + addr += KVM_VECTOR_PREAMBLE; + + /* movz x0, #(addr & 0xffff) */ + insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0, + (u16)addr, + 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); + + /* movk x0, #((addr >> 16) & 0xffff), lsl #16 */ + insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0, + (u16)(addr >> 16), + 16, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_KEEP); + *updptr++ = cpu_to_le32(insn); + + /* movk x0, #((addr >> 32) & 0xffff), lsl #32 */ + insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0, + (u16)(addr >> 32), + 32, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_KEEP); + *updptr++ = cpu_to_le32(insn); + + /* br x0 */ + insn = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_0, + AARCH64_INSN_BRANCH_NOLINK); + *updptr++ = cpu_to_le32(insn); +} + +static void generate_mov_q(u64 val, __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u32 insn, oinsn, rd; + + BUG_ON(nr_inst != 4); + + /* Compute target register */ + oinsn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn); + + /* movz rd, #(val & 0xffff) */ + insn = aarch64_insn_gen_movewide(rd, + (u16)val, + 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); + + /* movk rd, #((val >> 16) & 0xffff), lsl #16 */ + insn = aarch64_insn_gen_movewide(rd, + (u16)(val >> 16), + 16, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_KEEP); + *updptr++ = cpu_to_le32(insn); + + /* movk rd, #((val >> 32) & 0xffff), lsl #32 */ + insn = aarch64_insn_gen_movewide(rd, + (u16)(val >> 32), + 32, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_KEEP); + *updptr++ = cpu_to_le32(insn); + + /* movk rd, #((val >> 48) & 0xffff), lsl #48 */ + insn = aarch64_insn_gen_movewide(rd, + (u16)(val >> 48), + 48, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_KEEP); + *updptr++ = cpu_to_le32(insn); +} + +void kvm_get_kimage_voffset(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + generate_mov_q(kimage_voffset, origptr, updptr, nr_inst); +} + +void kvm_compute_final_ctr_el0(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + generate_mov_q(read_sanitised_ftr_reg(SYS_CTR_EL0), + origptr, updptr, nr_inst); +} diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c new file mode 100644 index 000000000000..bdc2d57370b2 --- /dev/null +++ 
b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -0,0 +1,489 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGIC system registers handling functions for AArch64 mode + */ + +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include "vgic/vgic.h" +#include "sys_regs.h" + +static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u32 host_pri_bits, host_id_bits, host_seis, host_a3v, seis, a3v; + struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + + /* + * Disallow restoring VM state if not supported by this + * hardware. + */ + host_pri_bits = FIELD_GET(ICC_CTLR_EL1_PRI_BITS_MASK, val) + 1; + if (host_pri_bits > vgic_v3_cpu->num_pri_bits) + return -EINVAL; + + vgic_v3_cpu->num_pri_bits = host_pri_bits; + + host_id_bits = FIELD_GET(ICC_CTLR_EL1_ID_BITS_MASK, val); + if (host_id_bits > vgic_v3_cpu->num_id_bits) + return -EINVAL; + + vgic_v3_cpu->num_id_bits = host_id_bits; + + host_seis = FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2); + seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val); + if (host_seis != seis) + return -EINVAL; + + host_a3v = FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2); + a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val); + if (host_a3v != a3v) + return -EINVAL; + + /* + * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. + * The vgic_set_vmcr() will convert to ICH_VMCR layout. + */ + vmcr.cbpr = FIELD_GET(ICC_CTLR_EL1_CBPR_MASK, val); + vmcr.eoim = FIELD_GET(ICC_CTLR_EL1_EOImode_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *valp) +{ + struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; + struct vgic_vmcr vmcr; + u64 val; + + vgic_get_vmcr(vcpu, &vmcr); + val = 0; + val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1); + val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits); + val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK, + FIELD_GET(ICH_VTR_EL2_SEIS, + kvm_vgic_global_state.ich_vtr_el2)); + val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK, + FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2)); + /* + * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. + * Extract it directly using ICC_CTLR_EL1 reg definitions. 
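+ * Only the CBPR and EOImode bits below come from the VMCR; the
+ * PRIbits/IDbits/SEIS/A3V fields above describe fixed hardware state.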
+ */ + val |= FIELD_PREP(ICC_CTLR_EL1_CBPR_MASK, vmcr.cbpr); + val |= FIELD_PREP(ICC_CTLR_EL1_EOImode_MASK, vmcr.eoim); + + *valp = val; + + return 0; +} + +static int set_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + vmcr.pmr = FIELD_GET(ICC_PMR_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = FIELD_PREP(ICC_PMR_EL1_MASK, vmcr.pmr); + + return 0; +} + +static int set_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + vmcr.bpr = FIELD_GET(ICC_BPR0_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = FIELD_PREP(ICC_BPR0_EL1_MASK, vmcr.bpr); + + return 0; +} + +static int set_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + if (!vmcr.cbpr) { + vmcr.abpr = FIELD_GET(ICC_BPR1_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + } + + return 0; +} + +static int get_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + if (!vmcr.cbpr) + *val = FIELD_PREP(ICC_BPR1_EL1_MASK, vmcr.abpr); + else + *val = min((vmcr.bpr + 1), 7U); + + + return 0; +} + +static int set_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + vmcr.grpen0 = FIELD_GET(ICC_IGRPEN0_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = FIELD_PREP(ICC_IGRPEN0_EL1_MASK, vmcr.grpen0); + + return 0; +} + +static int set_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + vmcr.grpen1 = FIELD_GET(ICC_IGRPEN1_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = FIELD_GET(ICC_IGRPEN1_EL1_MASK, vmcr.grpen1); + + return 0; +} + +static void set_apr_reg(struct kvm_vcpu *vcpu, u64 val, u8 apr, u8 idx) +{ + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (apr) + vgicv3->vgic_ap1r[idx] = val; + else + vgicv3->vgic_ap0r[idx] = val; +} + +static u64 get_apr_reg(struct kvm_vcpu *vcpu, u8 apr, u8 idx) +{ + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (apr) + return vgicv3->vgic_ap1r[idx]; + else + return vgicv3->vgic_ap0r[idx]; +} + +static int set_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) + +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + set_apr_reg(vcpu, val, 0, idx); + return 0; +} + +static int get_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + *val = get_apr_reg(vcpu, 0, idx); + + return 0; +} + +static int set_gic_ap1r(struct kvm_vcpu *vcpu, 
const struct sys_reg_desc *r, + u64 val) + +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + set_apr_reg(vcpu, val, 1, idx); + return 0; +} + +static int get_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + *val = get_apr_reg(vcpu, 1, idx); + + return 0; +} + +static int set_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + /* Validate SRE bit */ + if (!(val & ICC_SRE_EL1_SRE)) + return -EINVAL; + + return 0; +} + +static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + *val = vgicv3->vgic_sre; + + return 0; +} + +static int set_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + __vcpu_assign_sys_reg(vcpu, r->reg, val); + return 0; +} + +static int get_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = __vcpu_sys_reg(vcpu, r->reg); + return 0; +} + +static int set_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + return set_gic_ich_reg(vcpu, r, val); +} + +static int get_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + return get_gic_ich_reg(vcpu, r, val); +} + +static int set_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + if (val != KVM_ICC_SRE_EL2) + return -EINVAL; + return 0; +} + +static int get_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = KVM_ICC_SRE_EL2; + return 0; +} + +static int set_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + if (val != kvm_get_guest_vtr_el2()) + return -EINVAL; + return 0; +} + +static int get_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = kvm_get_guest_vtr_el2(); + return 0; +} + +static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return vcpu_has_nv(vcpu) ? 
0 : REG_HIDDEN; +} + +#define __EL2_REG(r, acc, i) \ + { \ + SYS_DESC(SYS_ ## r), \ + .get_user = get_gic_ ## acc, \ + .set_user = set_gic_ ## acc, \ + .reg = i, \ + .visibility = el2_visibility, \ + } + +#define EL2_REG(r, acc) __EL2_REG(r, acc, r) + +#define EL2_REG_RO(r, acc) __EL2_REG(r, acc, 0) + +static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { + { SYS_DESC(SYS_ICC_PMR_EL1), + .set_user = set_gic_pmr, .get_user = get_gic_pmr, }, + { SYS_DESC(SYS_ICC_BPR0_EL1), + .set_user = set_gic_bpr0, .get_user = get_gic_bpr0, }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_BPR1_EL1), + .set_user = set_gic_bpr1, .get_user = get_gic_bpr1, }, + { SYS_DESC(SYS_ICC_CTLR_EL1), + .set_user = set_gic_ctlr, .get_user = get_gic_ctlr, }, + { SYS_DESC(SYS_ICC_SRE_EL1), + .set_user = set_gic_sre, .get_user = get_gic_sre, }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), + .set_user = set_gic_grpen0, .get_user = get_gic_grpen0, }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), + .set_user = set_gic_grpen1, .get_user = get_gic_grpen1, }, + EL2_REG(ICH_AP0R0_EL2, ich_apr), + EL2_REG(ICH_AP0R1_EL2, ich_apr), + EL2_REG(ICH_AP0R2_EL2, ich_apr), + EL2_REG(ICH_AP0R3_EL2, ich_apr), + EL2_REG(ICH_AP1R0_EL2, ich_apr), + EL2_REG(ICH_AP1R1_EL2, ich_apr), + EL2_REG(ICH_AP1R2_EL2, ich_apr), + EL2_REG(ICH_AP1R3_EL2, ich_apr), + EL2_REG_RO(ICC_SRE_EL2, icc_sre), + EL2_REG(ICH_HCR_EL2, ich_reg), + EL2_REG_RO(ICH_VTR_EL2, ich_vtr), + EL2_REG(ICH_VMCR_EL2, ich_reg), + EL2_REG(ICH_LR0_EL2, ich_reg), + EL2_REG(ICH_LR1_EL2, ich_reg), + EL2_REG(ICH_LR2_EL2, ich_reg), + EL2_REG(ICH_LR3_EL2, ich_reg), + EL2_REG(ICH_LR4_EL2, ich_reg), + EL2_REG(ICH_LR5_EL2, ich_reg), + EL2_REG(ICH_LR6_EL2, ich_reg), + EL2_REG(ICH_LR7_EL2, ich_reg), + EL2_REG(ICH_LR8_EL2, ich_reg), + EL2_REG(ICH_LR9_EL2, ich_reg), + EL2_REG(ICH_LR10_EL2, ich_reg), + EL2_REG(ICH_LR11_EL2, ich_reg), + EL2_REG(ICH_LR12_EL2, ich_reg), + EL2_REG(ICH_LR13_EL2, ich_reg), + EL2_REG(ICH_LR14_EL2, ich_reg), + EL2_REG(ICH_LR15_EL2, ich_reg), +}; + +const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz) +{ + *sz = ARRAY_SIZE(gic_v3_icc_reg_descs); + return gic_v3_icc_reg_descs; +} + +static u64 attr_to_id(u64 attr) +{ + return ARM64_SYS_REG(FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP1_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRN_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRM_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP2_MASK, attr)); +} + +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + const struct sys_reg_desc *r; + + r = get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); + + if (r && !sysreg_hidden(vcpu, r)) + return 0; + + return -ENXIO; +} + +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr, + bool 
is_write) +{ + struct kvm_one_reg reg = { + .id = attr_to_id(attr->attr), + .addr = attr->addr, + }; + + if (is_write) + return kvm_sys_reg_set_user(vcpu, ®, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); + else + return kvm_sys_reg_get_user(vcpu, ®, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); +} diff --git a/arch/arm64/kvm/vgic/trace.h b/arch/arm64/kvm/vgic/trace.h new file mode 100644 index 000000000000..83c64401a7fc --- /dev/null +++ b/arch/arm64/kvm/vgic/trace.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VGIC_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +TRACE_EVENT(vgic_update_irq_pending, + TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( bool, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level: %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +#endif /* _TRACE_VGIC_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/arm64/kvm/vgic +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c new file mode 100644 index 000000000000..bb92853d1fd3 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -0,0 +1,557 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 Linaro + * Author: Christoffer Dall <christoffer.dall@linaro.org> + */ + +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/interrupt.h> +#include <linux/kvm_host.h> +#include <linux/seq_file.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_mmu.h> +#include "vgic.h" + +/* + * Structure to control looping through the entire vgic state. We start at + * zero for each field and move upwards. So, if dist_id is 0 we print the + * distributor info. When dist_id is 1, we have already printed it and move + * on. + * + * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and + * so on. + */ +struct vgic_state_iter { + int nr_cpus; + int nr_spis; + int nr_lpis; + int dist_id; + int vcpu_id; + unsigned long intid; + int lpi_idx; +}; + +static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + if (iter->dist_id == 0) { + iter->dist_id++; + return; + } + + /* + * Let the xarray drive the iterator after the last SPI, as the iterator + * has exhausted the sequentially-allocated INTID space. 
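+ * (INTIDs 0-31 are the per-vCPU private interrupts, SPIs run from 32 to
+ * 32 + nr_spis - 1, and LPIs occupy the sparse range from 8192 upwards,
+ * which is what the xarray tracks.)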
+ */ + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) && + iter->nr_lpis) { + if (iter->lpi_idx < iter->nr_lpis) + xa_find_after(&dist->lpi_xa, &iter->intid, + VGIC_LPI_MAX_INTID, + LPI_XA_MARK_DEBUG_ITER); + iter->lpi_idx++; + return; + } + + iter->intid++; + if (iter->intid == VGIC_NR_PRIVATE_IRQS && + ++iter->vcpu_id < iter->nr_cpus) + iter->intid = 0; +} + +static int iter_mark_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid, flags; + struct vgic_irq *irq; + int nr_lpis = 0; + + xa_lock_irqsave(&dist->lpi_xa, flags); + + xa_for_each(&dist->lpi_xa, intid, irq) { + if (!vgic_try_get_irq_ref(irq)) + continue; + + __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + nr_lpis++; + } + + xa_unlock_irqrestore(&dist->lpi_xa, flags); + + return nr_lpis; +} + +static void iter_unmark_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid, flags; + struct vgic_irq *irq; + + xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { + xa_lock_irqsave(&dist->lpi_xa, flags); + __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + xa_unlock_irqrestore(&dist->lpi_xa, flags); + + /* vgic_put_irq() expects to be called outside of the xa_lock */ + vgic_put_irq(kvm, irq); + } +} + +static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, + loff_t pos) +{ + int nr_cpus = atomic_read(&kvm->online_vcpus); + + memset(iter, 0, sizeof(*iter)); + + iter->nr_cpus = nr_cpus; + iter->nr_spis = kvm->arch.vgic.nr_spis; + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + iter->nr_lpis = iter_mark_lpis(kvm); + + /* Fast forward to the right position if needed */ + while (pos--) + iter_next(kvm, iter); +} + +static bool end_of_vgic(struct vgic_state_iter *iter) +{ + return iter->dist_id > 0 && + iter->vcpu_id == iter->nr_cpus && + iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && + (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis); +} + +static void *vgic_debug_start(struct seq_file *s, loff_t *pos) +{ + struct kvm *kvm = s->private; + struct vgic_state_iter *iter; + + mutex_lock(&kvm->arch.config_lock); + iter = kvm->arch.vgic.iter; + if (iter) { + iter = ERR_PTR(-EBUSY); + goto out; + } + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + iter = ERR_PTR(-ENOMEM); + goto out; + } + + iter_init(kvm, iter, *pos); + kvm->arch.vgic.iter = iter; + + if (end_of_vgic(iter)) + iter = NULL; +out: + mutex_unlock(&kvm->arch.config_lock); + return iter; +} + +static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kvm *kvm = s->private; + struct vgic_state_iter *iter = kvm->arch.vgic.iter; + + ++*pos; + iter_next(kvm, iter); + if (end_of_vgic(iter)) + iter = NULL; + return iter; +} + +static void vgic_debug_stop(struct seq_file *s, void *v) +{ + struct kvm *kvm = s->private; + struct vgic_state_iter *iter; + + /* + * If the seq file wasn't properly opened, there's nothing to clearn + * up. + */ + if (IS_ERR(v)) + return; + + mutex_lock(&kvm->arch.config_lock); + iter = kvm->arch.vgic.iter; + iter_unmark_lpis(kvm); + kfree(iter); + kvm->arch.vgic.iter = NULL; + mutex_unlock(&kvm->arch.config_lock); +} + +static void print_dist_state(struct seq_file *s, struct vgic_dist *dist, + struct vgic_state_iter *iter) +{ + bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; + + seq_printf(s, "Distributor\n"); + seq_printf(s, "===========\n"); + seq_printf(s, "vgic_model:\t%s\n", v3 ? 
"GICv3" : "GICv2"); + seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); + if (v3) + seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis); + seq_printf(s, "enabled:\t%d\n", dist->enabled); + seq_printf(s, "\n"); + + seq_printf(s, "P=pending_latch, L=line_level, A=active\n"); + seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n"); + seq_printf(s, "G=group\n"); +} + +static void print_header(struct seq_file *s, struct vgic_irq *irq, + struct kvm_vcpu *vcpu) +{ + int id = 0; + char *hdr = "SPI "; + + if (vcpu) { + hdr = "VCPU"; + id = vcpu->vcpu_idx; + } + + seq_printf(s, "\n"); + seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHCG HWID TARGET SRC PRI VCPU_ID\n", hdr, id); + seq_printf(s, "----------------------------------------------------------------\n"); +} + +static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, + struct kvm_vcpu *vcpu) +{ + char *type; + bool pending; + + if (irq->intid < VGIC_NR_SGIS) + type = "SGI"; + else if (irq->intid < VGIC_NR_PRIVATE_IRQS) + type = "PPI"; + else if (irq->intid < VGIC_MAX_SPI) + type = "SPI"; + else + type = "LPI"; + + if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS) + print_header(s, irq, vcpu); + + pending = irq->pending_latch; + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + int err; + + err = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &pending); + WARN_ON_ONCE(err); + } + + seq_printf(s, " %s %4d " + " %2d " + "%d%d%d%d%d%d%d " + "%8d " + "%8x " + " %2x " + "%3d " + " %2d " + "\n", + type, irq->intid, + (irq->target_vcpu) ? irq->target_vcpu->vcpu_idx : -1, + pending, + irq->line_level, + irq->active, + irq->enabled, + irq->hw, + irq->config == VGIC_CONFIG_LEVEL, + irq->group, + irq->hwintid, + irq->mpidr, + irq->source, + irq->priority, + (irq->vcpu) ? irq->vcpu->vcpu_idx : -1); +} + +static int vgic_debug_show(struct seq_file *s, void *v) +{ + struct kvm *kvm = s->private; + struct vgic_state_iter *iter = v; + struct vgic_irq *irq; + struct kvm_vcpu *vcpu = NULL; + unsigned long flags; + + if (iter->dist_id == 0) { + print_dist_state(s, &kvm->arch.vgic, iter); + return 0; + } + + if (!kvm->arch.vgic.initialized) + return 0; + + if (iter->vcpu_id < iter->nr_cpus) + vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); + + /* + * Expect this to succeed, as iter_mark_lpis() takes a reference on + * every LPI to be visited. + */ + if (iter->intid < VGIC_NR_PRIVATE_IRQS) + irq = vgic_get_vcpu_irq(vcpu, iter->intid); + else + irq = vgic_get_irq(kvm, iter->intid); + if (WARN_ON_ONCE(!irq)) + return -EINVAL; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + print_irq_state(s, irq, vcpu); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(kvm, irq); + return 0; +} + +static const struct seq_operations vgic_debug_sops = { + .start = vgic_debug_start, + .next = vgic_debug_next, + .stop = vgic_debug_stop, + .show = vgic_debug_show +}; + +DEFINE_SEQ_ATTRIBUTE(vgic_debug); + +void vgic_debug_init(struct kvm *kvm) +{ + debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm, + &vgic_debug_fops); +} + +void vgic_debug_destroy(struct kvm *kvm) +{ +} + +/** + * struct vgic_its_iter - Iterator for traversing VGIC ITS device tables. + * @dev: Pointer to the current its_device being processed. + * @ite: Pointer to the current its_ite within the device being processed. + * + * This structure is used to maintain the current position during iteration + * over the ITS device tables. It holds pointers to both the current device + * and the current ITE within that device. 
+ */ +struct vgic_its_iter { + struct its_device *dev; + struct its_ite *ite; +}; + +/** + * end_of_iter - Checks if the iterator has reached the end. + * @iter: The iterator to check. + * + * When the iterator completed processing the final ITE in the last device + * table, it was marked to indicate the end of iteration by setting its + * device and ITE pointers to NULL. + * This function checks whether the iterator was marked as end. + * + * Return: True if the iterator is marked as end, false otherwise. + */ +static inline bool end_of_iter(struct vgic_its_iter *iter) +{ + return !iter->dev && !iter->ite; +} + +/** + * vgic_its_iter_next - Advances the iterator to the next entry in the ITS tables. + * @its: The VGIC ITS structure. + * @iter: The iterator to advance. + * + * This function moves the iterator to the next ITE within the current device, + * or to the first ITE of the next device if the current ITE is the last in + * the device. If the current device is the last device, the iterator is set + * to indicate the end of iteration. + */ +static void vgic_its_iter_next(struct vgic_its *its, struct vgic_its_iter *iter) +{ + struct its_device *dev = iter->dev; + struct its_ite *ite = iter->ite; + + if (!ite || list_is_last(&ite->ite_list, &dev->itt_head)) { + if (list_is_last(&dev->dev_list, &its->device_list)) { + dev = NULL; + ite = NULL; + } else { + dev = list_next_entry(dev, dev_list); + ite = list_first_entry_or_null(&dev->itt_head, + struct its_ite, + ite_list); + } + } else { + ite = list_next_entry(ite, ite_list); + } + + iter->dev = dev; + iter->ite = ite; +} + +/** + * vgic_its_debug_start - Start function for the seq_file interface. + * @s: The seq_file structure. + * @pos: The starting position (offset). + * + * This function initializes the iterator to the beginning of the ITS tables + * and advances it to the specified position. It acquires the its_lock mutex + * to protect shared data. + * + * Return: An iterator pointer on success, NULL if no devices are found or + * the end of the list is reached, or ERR_PTR(-ENOMEM) on memory + * allocation failure. + */ +static void *vgic_its_debug_start(struct seq_file *s, loff_t *pos) +{ + struct vgic_its *its = s->private; + struct vgic_its_iter *iter; + struct its_device *dev; + loff_t offset = *pos; + + mutex_lock(&its->its_lock); + + dev = list_first_entry_or_null(&its->device_list, + struct its_device, dev_list); + if (!dev) + return NULL; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return ERR_PTR(-ENOMEM); + + iter->dev = dev; + iter->ite = list_first_entry_or_null(&dev->itt_head, + struct its_ite, ite_list); + + while (!end_of_iter(iter) && offset--) + vgic_its_iter_next(its, iter); + + if (end_of_iter(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +/** + * vgic_its_debug_next - Next function for the seq_file interface. + * @s: The seq_file structure. + * @v: The current iterator. + * @pos: The current position (offset). + * + * This function advances the iterator to the next entry and increments the + * position. + * + * Return: An iterator pointer on success, or NULL if the end of the list is + * reached. + */ +static void *vgic_its_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct vgic_its *its = s->private; + struct vgic_its_iter *iter = v; + + ++*pos; + vgic_its_iter_next(its, iter); + + if (end_of_iter(iter)) { + kfree(iter); + return NULL; + } + return iter; +} + +/** + * vgic_its_debug_stop - Stop function for the seq_file interface. + * @s: The seq_file structure. 
+ * @v: The current iterator. + * + * This function frees the iterator and releases the its_lock mutex. + */ +static void vgic_its_debug_stop(struct seq_file *s, void *v) +{ + struct vgic_its *its = s->private; + struct vgic_its_iter *iter = v; + + if (!IS_ERR_OR_NULL(iter)) + kfree(iter); + mutex_unlock(&its->its_lock); +} + +/** + * vgic_its_debug_show - Show function for the seq_file interface. + * @s: The seq_file structure. + * @v: The current iterator. + * + * This function formats and prints the ITS table entry information to the + * seq_file output. + * + * Return: 0 on success. + */ +static int vgic_its_debug_show(struct seq_file *s, void *v) +{ + struct vgic_its_iter *iter = v; + struct its_device *dev = iter->dev; + struct its_ite *ite = iter->ite; + + if (!ite) + return 0; + + if (list_is_first(&ite->ite_list, &dev->itt_head)) { + seq_printf(s, "\n"); + seq_printf(s, "Device ID: 0x%x, Event ID Range: [0 - %llu]\n", + dev->device_id, BIT_ULL(dev->num_eventid_bits) - 1); + seq_printf(s, "EVENT_ID INTID HWINTID TARGET COL_ID HW\n"); + seq_printf(s, "-----------------------------------------------\n"); + } + + if (ite->irq && ite->collection) { + seq_printf(s, "%8u %8u %8u %8u %8u %2d\n", + ite->event_id, ite->irq->intid, ite->irq->hwintid, + ite->collection->target_addr, + ite->collection->collection_id, ite->irq->hw); + } + + return 0; +} + +static const struct seq_operations vgic_its_debug_sops = { + .start = vgic_its_debug_start, + .next = vgic_its_debug_next, + .stop = vgic_its_debug_stop, + .show = vgic_its_debug_show +}; + +DEFINE_SEQ_ATTRIBUTE(vgic_its_debug); + +/** + * vgic_its_debug_init - Initializes the debugfs interface for VGIC ITS. + * @dev: The KVM device structure. + * + * This function creates a debugfs file named "vgic-its-state@%its_base" + * to expose the ITS table information. + * + * Return: 0 on success. + */ +int vgic_its_debug_init(struct kvm_device *dev) +{ + struct vgic_its *its = dev->private; + char *name; + + name = kasprintf(GFP_KERNEL, "vgic-its-state@%llx", (u64)its->vgic_its_base); + if (!name) + return -ENOMEM; + + debugfs_create_file(name, 0444, dev->kvm->debugfs_dentry, its, &vgic_its_debug_fops); + + kfree(name); + return 0; +} + +void vgic_its_debug_destroy(struct kvm_device *dev) +{ +} diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c new file mode 100644 index 000000000000..dc9f9db31026 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -0,0 +1,754 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. + */ + +#include <linux/uaccess.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_mmu.h> +#include "vgic.h" + +/* + * Initialization rules: there are multiple stages to the vgic + * initialization, both for the distributor and the CPU interfaces. The basic + * idea is that even though the VGIC is not functional or not requested from + * user space, the critical path of the run loop can still call VGIC functions + * that just won't do anything, without them having to check additional + * initialization flags to ensure they don't look at uninitialized data + * structures. + * + * Distributor: + * + * - kvm_vgic_early_init(): initialization of static data that doesn't + * depend on any sizing information or emulation type. No allocation + * is allowed there. 
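+ * Currently this only amounts to initialising the LPI xarray (see below).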
+ * + * - vgic_init(): allocation and initialization of the generic data + * structures that depend on sizing information (number of CPUs, + * number of interrupts). Also initializes the vcpu specific data + * structures. Can be executed lazily for GICv2. + * + * CPU Interface: + * + * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend + * on any sizing information. Private interrupts are allocated if not + * already allocated at vgic-creation time. + */ + +/* EARLY INIT */ + +/** + * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures + * @kvm: The VM whose VGIC districutor should be initialized + * + * Only do initialization of static structures that don't require any + * allocation or sizing information from userspace. vgic_init() called + * kvm_vgic_dist_init() which takes care of the rest. + */ +void kvm_vgic_early_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); +} + +/* CREATION */ + +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); + +/** + * kvm_vgic_create: triggered by the instantiation of the VGIC device by + * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) + * or through the generic KVM_CREATE_DEVICE API ioctl. + * irqchip_in_kernel() tells you if this function succeeded or not. + * @kvm: kvm struct pointer + * @type: KVM_DEV_TYPE_ARM_VGIC_V[23] + */ +int kvm_vgic_create(struct kvm *kvm, u32 type) +{ + struct kvm_vcpu *vcpu; + u64 aa64pfr0, pfr1; + unsigned long i; + int ret; + + /* + * This function is also called by the KVM_CREATE_IRQCHIP handler, + * which had no chance yet to check the availability of the GICv2 + * emulation. So check this here again. KVM_CREATE_DEVICE does + * the proper checks already. + */ + if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && + !kvm_vgic_global_state.can_emulate_gicv2) + return -ENODEV; + + /* + * Ensure mutual exclusion with vCPU creation and any vCPU ioctls by: + * + * - Holding kvm->lock to prevent KVM_CREATE_VCPU from reaching + * kvm_arch_vcpu_precreate() and ensuring created_vcpus is stable. + * This alone is insufficient, as kvm_vm_ioctl_create_vcpu() drops + * the kvm->lock before completing the vCPU creation. + */ + lockdep_assert_held(&kvm->lock); + + /* + * - Acquiring the vCPU mutex for every *online* vCPU to prevent + * concurrent vCPU ioctls for vCPUs already visible to userspace. + */ + ret = -EBUSY; + if (kvm_trylock_all_vcpus(kvm)) + return ret; + + /* + * - Taking the config_lock which protects VGIC data structures such + * as the per-vCPU arrays of private IRQs (SGIs, PPIs). + */ + mutex_lock(&kvm->arch.config_lock); + + /* + * - Bailing on the entire thing if a vCPU is in the middle of creation, + * dropped the kvm->lock, but hasn't reached kvm_arch_vcpu_create(). + * + * The whole combination of this guarantees that no vCPU can get into + * KVM with a VGIC configuration inconsistent with the VM's VGIC. 
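+ *
+ * In short, the resulting lock order is kvm->lock, then every vCPU
+ * mutex, then kvm->arch.config_lock.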
+ */ + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) + goto out_unlock; + + if (irqchip_in_kernel(kvm)) { + ret = -EEXIST; + goto out_unlock; + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu_has_run_once(vcpu)) + goto out_unlock; + } + ret = 0; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + kvm->max_vcpus = VGIC_V2_MAX_CPUS; + else + kvm->max_vcpus = VGIC_V3_MAX_CPUS; + + if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) { + ret = -E2BIG; + goto out_unlock; + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + ret = vgic_allocate_private_irqs_locked(vcpu, type); + if (ret) + break; + } + + if (ret) { + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + } + + goto out_unlock; + } + + kvm->arch.vgic.in_kernel = true; + kvm->arch.vgic.vgic_model = type; + kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST; + + kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; + + aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) { + kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; + } else { + INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); + aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); + } + + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); + + if (type == KVM_DEV_TYPE_ARM_VGIC_V3) + kvm->arch.vgic.nassgicap = system_supports_direct_sgis(); + +out_unlock: + mutex_unlock(&kvm->arch.config_lock); + kvm_unlock_all_vcpus(kvm); + return ret; +} + +/* INIT/DESTROY */ + +/** + * kvm_vgic_dist_init: initialize the dist data structures + * @kvm: kvm struct pointer + * @nr_spis: number of spis, frozen by caller + */ +static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); + int i; + + dist->active_spis = (atomic_t)ATOMIC_INIT(0); + dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); + if (!dist->spis) + return -ENOMEM; + + /* + * In the following code we do not take the irq struct lock since + * no other action on irq structs can happen while the VGIC is + * not initialized yet: + * If someone wants to inject an interrupt or does a MMIO access, we + * require prior initialization in case of a virtual GICv3 or trigger + * initialization when using a virtual GICv2. + */ + for (i = 0; i < nr_spis; i++) { + struct vgic_irq *irq = &dist->spis[i]; + + irq->intid = i + VGIC_NR_PRIVATE_IRQS; + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + irq->vcpu = NULL; + irq->target_vcpu = vcpu0; + refcount_set(&irq->refcount, 0); + switch (dist->vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->targets = 0; + irq->group = 0; + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->mpidr = 0; + irq->group = 1; + break; + default: + kfree(dist->spis); + dist->spis = NULL; + return -EINVAL; + } + } + return 0; +} + +/* Default GICv3 Maintenance Interrupt INTID, as per SBSA */ +#define DEFAULT_MI_INTID 25 + +int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu) +{ + int ret; + + guard(mutex)(&vcpu->kvm->arch.config_lock); + + /* + * Matching the tradition established with the timers, provide + * a default PPI for the maintenance interrupt. It makes + * things easier to reason about. 
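+ *
+ * A minimal sketch of the resulting default, assuming the usual INTID
+ * ranges (SGIs 0-15, PPIs 16-31, SPIs 32 and up):
+ *
+ *   mi_intid = 25;                        // a PPI, so per-vCPU
+ *   kvm_vgic_set_owner(vcpu, 25, vcpu);   // reserve the line for the
+ *                                         // vGIC so no other in-kernel
+ *                                         // user (e.g. the timers) can
+ *                                         // claim it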
+ */ + if (vcpu->kvm->arch.vgic.mi_intid == 0) + vcpu->kvm->arch.vgic.mi_intid = DEFAULT_MI_INTID; + ret = kvm_vgic_set_owner(vcpu, vcpu->kvm->arch.vgic.mi_intid, vcpu); + + return ret; +} + +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + int i; + + lockdep_assert_held(&vcpu->kvm->arch.config_lock); + + if (vgic_cpu->private_irqs) + return 0; + + vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS, + sizeof(struct vgic_irq), + GFP_KERNEL_ACCOUNT); + + if (!vgic_cpu->private_irqs) + return -ENOMEM; + + /* + * Enable and configure all SGIs to be edge-triggered and + * configure all PPIs as level-triggered. + */ + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + irq->intid = i; + irq->vcpu = NULL; + irq->target_vcpu = vcpu; + refcount_set(&irq->refcount, 0); + if (vgic_irq_is_sgi(i)) { + /* SGIs */ + irq->enabled = 1; + irq->config = VGIC_CONFIG_EDGE; + } else { + /* PPIs */ + irq->config = VGIC_CONFIG_LEVEL; + } + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->group = 1; + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->group = 0; + irq->targets = BIT(vcpu->vcpu_id); + break; + } + } + + return 0; +} + +static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type) +{ + int ret; + + mutex_lock(&vcpu->kvm->arch.config_lock); + ret = vgic_allocate_private_irqs_locked(vcpu, type); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return ret; +} + +/** + * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data + * structures and register VCPU-specific KVM iodevs + * + * @vcpu: pointer to the VCPU being created and initialized + * + * Only do initialization, but do not actually enable the + * VGIC CPU interface + */ +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int ret = 0; + + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + raw_spin_lock_init(&vgic_cpu->ap_list_lock); + atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + + if (!irqchip_in_kernel(vcpu->kvm)) + return 0; + + ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model); + if (ret) + return ret; + + /* + * If we are creating a VCPU with a GICv3 we must also register the + * KVM io device for the redistributor that belongs to this VCPU. + */ + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + mutex_lock(&vcpu->kvm->slots_lock); + ret = vgic_register_redist_iodev(vcpu); + mutex_unlock(&vcpu->kvm->slots_lock); + } + return ret; +} + +static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_reset(vcpu); + else + vgic_v3_reset(vcpu); +} + +/* + * vgic_init: allocates and initializes dist and vcpu data structures + * depending on two dimensioning parameters: + * - the number of spis + * - the number of vcpus + * The function is generally called when nr_spis has been explicitly set + * by the guest through the KVM DEVICE API. If not nr_spis is set to 256. + * vgic_initialized() returns true when this function has succeeded. 
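+ *
+ * Worked example for the default, assuming the usual values
+ * VGIC_NR_IRQS_LEGACY == 256 and VGIC_NR_PRIVATE_IRQS == 32:
+ *
+ *   nr_spis = 256 - 32 = 224   // SPIs, i.e. INTIDs 32..255
+ *
+ * which is where the 256 above comes from once the 32 private interrupts
+ * (SGIs and PPIs) of each vCPU are counted back in.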
+ */ +int vgic_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int ret = 0; + unsigned long idx; + + lockdep_assert_held(&kvm->arch.config_lock); + + if (vgic_initialized(kvm)) + return 0; + + /* Are we also in the middle of creating a VCPU? */ + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) + return -EBUSY; + + /* freeze the number of spis */ + if (!dist->nr_spis) + dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; + + ret = kvm_vgic_dist_init(kvm, dist->nr_spis); + if (ret) + goto out; + + /* + * Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs, + * vLPIs) is supported. + */ + if (vgic_supports_direct_irqs(kvm)) { + ret = vgic_v4_init(kvm); + if (ret) + goto out; + } + + kvm_for_each_vcpu(idx, vcpu, kvm) + kvm_vgic_vcpu_reset(vcpu); + + ret = kvm_vgic_setup_default_irq_routing(kvm); + if (ret) + goto out; + + vgic_debug_init(kvm); + dist->initialized = true; +out: + return ret; +} + +static void kvm_vgic_dist_destroy(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_redist_region *rdreg, *next; + + dist->ready = false; + dist->initialized = false; + + kfree(dist->spis); + dist->spis = NULL; + dist->nr_spis = 0; + dist->vgic_dist_base = VGIC_ADDR_UNDEF; + + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) + vgic_v3_free_redist_region(kvm, rdreg); + INIT_LIST_HEAD(&dist->rd_regions); + } else { + dist->vgic_cpu_base = VGIC_ADDR_UNDEF; + } + + if (vgic_supports_direct_irqs(kvm)) + vgic_v4_teardown(kvm); + + xa_destroy(&dist->lpi_xa); +} + +static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + /* + * Retire all pending LPIs on this vcpu anyway as we're + * going to destroy it. + */ + vgic_flush_pending_lpis(vcpu); + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + /* + * If this vCPU is being destroyed because of a failed creation + * then unregister the redistributor to avoid leaving behind a + * dangling pointer to the vCPU struct. + * + * vCPUs that have been successfully created (i.e. added to + * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as + * this function gets called while holding kvm->arch.config_lock + * in the VM teardown path and would otherwise introduce a lock + * inversion w.r.t. kvm->srcu. + * + * vCPUs that failed creation are torn down outside of the + * kvm->arch.config_lock and do not get unregistered in + * kvm_vgic_destroy(), meaning it is both safe and necessary to + * do so here. 
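+ *
+ * Put differently, the lookup below acts as a cheap "was this vCPU ever
+ * published in kvm->vcpu_array?" test (a sketch of the intent):
+ *
+ *   kvm_get_vcpu_by_id(kvm, id) == vcpu  -> published vCPU; the iodev is
+ *                                           unregistered later, in
+ *                                           kvm_vgic_destroy()
+ *   kvm_get_vcpu_by_id(kvm, id) != vcpu  -> failed creation; unregister
+ *                                           the redistributor right here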
+ */ + if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu) + vgic_unregister_redist_iodev(vcpu); + + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + } +} + +void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->slots_lock); + __kvm_vgic_vcpu_destroy(vcpu); + mutex_unlock(&kvm->slots_lock); +} + +void kvm_vgic_destroy(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->arch.config_lock); + + vgic_debug_destroy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + __kvm_vgic_vcpu_destroy(vcpu); + + kvm_vgic_dist_destroy(kvm); + + mutex_unlock(&kvm->arch.config_lock); + + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + kvm_for_each_vcpu(i, vcpu, kvm) + vgic_unregister_redist_iodev(vcpu); + + mutex_unlock(&kvm->slots_lock); +} + +/** + * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest + * is a GICv2. A GICv3 must be explicitly initialized by userspace using the + * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group. + * @kvm: kvm struct pointer + */ +int vgic_lazy_init(struct kvm *kvm) +{ + int ret = 0; + + if (unlikely(!vgic_initialized(kvm))) { + /* + * We only provide the automatic initialization of the VGIC + * for the legacy case of a GICv2. Any other type must + * be explicitly initialized once setup with the respective + * KVM device call. + */ + if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) + return -EBUSY; + + mutex_lock(&kvm->arch.config_lock); + ret = vgic_init(kvm); + mutex_unlock(&kvm->arch.config_lock); + } + + return ret; +} + +/* RESOURCE MAPPING */ + +/** + * kvm_vgic_map_resources - map the MMIO regions + * @kvm: kvm struct pointer + * + * Map the MMIO regions depending on the VGIC model exposed to the guest + * called on the first VCPU run. + * Also map the virtual CPU interface into the VM. + * v2 calls vgic_init() if not already done. + * v3 and derivatives return an error if the VGIC is not initialized. + */ +int kvm_vgic_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + enum vgic_type type; + gpa_t dist_base; + int ret = 0; + + if (likely(smp_load_acquire(&dist->ready))) + return 0; + + mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->arch.config_lock); + if (dist->ready) + goto out; + + if (!irqchip_in_kernel(kvm)) + goto out; + + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { + ret = vgic_v2_map_resources(kvm); + type = VGIC_V2; + } else { + ret = vgic_v3_map_resources(kvm); + type = VGIC_V3; + } + + if (ret) + goto out; + + dist_base = dist->vgic_dist_base; + mutex_unlock(&kvm->arch.config_lock); + + ret = vgic_register_dist_iodev(kvm, dist_base, type); + if (ret) { + kvm_err("Unable to register VGIC dist MMIO regions\n"); + goto out_slots; + } + + smp_store_release(&dist->ready, true); + goto out_slots; +out: + mutex_unlock(&kvm->arch.config_lock); +out_slots: + if (ret) + kvm_vm_dead(kvm); + + mutex_unlock(&kvm->slots_lock); + + return ret; +} + +/* GENERIC PROBE */ + +void kvm_vgic_cpu_up(void) +{ + enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); +} + + +void kvm_vgic_cpu_down(void) +{ + disable_percpu_irq(kvm_vgic_global_state.maint_irq); +} + +static irqreturn_t vgic_maintenance_handler(int irq, void *data) +{ + struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)data; + + /* + * We cannot rely on the vgic maintenance interrupt to be + * delivered synchronously. 
This means we can only use it to + * exit the VM, and we perform the handling of EOIed + * interrupts on the exit path (see vgic_fold_lr_state). + * + * Of course, NV throws a wrench in this plan, and needs + * something special. + */ + if (vcpu && vgic_state_is_nested(vcpu)) + vgic_v3_handle_nested_maint_irq(vcpu); + + return IRQ_HANDLED; +} + +static struct gic_kvm_info *gic_kvm_info; + +void __init vgic_set_kvm_info(const struct gic_kvm_info *info) +{ + BUG_ON(gic_kvm_info != NULL); + gic_kvm_info = kmalloc(sizeof(*info), GFP_KERNEL); + if (gic_kvm_info) + *gic_kvm_info = *info; +} + +/** + * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware + * + * For a specific CPU, initialize the GIC VE hardware. + */ +void kvm_vgic_init_cpu_hardware(void) +{ + BUG_ON(preemptible()); + + /* + * We want to make sure the list registers start out clear so that we + * only have the program the used registers. + */ + if (kvm_vgic_global_state.type == VGIC_V2) { + vgic_v2_init_lrs(); + } else if (kvm_vgic_global_state.type == VGIC_V3 || + kvm_vgic_global_state.has_gcie_v3_compat) { + kvm_call_hyp(__vgic_v3_init_lrs); + } +} + +/** + * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable + * according to the host GIC model. Accordingly calls either + * vgic_v2/v3_probe which registers the KVM_DEVICE that can be + * instantiated by a guest later on . + */ +int kvm_vgic_hyp_init(void) +{ + bool has_mask; + int ret; + + if (!gic_kvm_info) + return -ENODEV; + + has_mask = !gic_kvm_info->no_maint_irq_mask; + + if (has_mask && !gic_kvm_info->maint_irq) { + kvm_err("No vgic maintenance irq\n"); + return -ENXIO; + } + + /* + * If we get one of these oddball non-GICs, taint the kernel, + * as we have no idea of how they *really* behave. + */ + if (gic_kvm_info->no_hw_deactivation) { + kvm_info("Non-architectural vgic, tainting kernel\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + kvm_vgic_global_state.no_hw_deactivation = true; + } + + switch (gic_kvm_info->type) { + case GIC_V2: + ret = vgic_v2_probe(gic_kvm_info); + break; + case GIC_V3: + ret = vgic_v3_probe(gic_kvm_info); + if (!ret) { + static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); + kvm_info("GIC system register CPU interface enabled\n"); + } + break; + case GIC_V5: + ret = vgic_v5_probe(gic_kvm_info); + break; + default: + ret = -ENODEV; + } + + kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq; + + kfree(gic_kvm_info); + gic_kvm_info = NULL; + + if (ret) + return ret; + + if (!has_mask && !kvm_vgic_global_state.maint_irq) + return 0; + + ret = request_percpu_irq(kvm_vgic_global_state.maint_irq, + vgic_maintenance_handler, + "vgic", kvm_get_running_vcpus()); + if (ret) { + kvm_err("Cannot register interrupt %d\n", + kvm_vgic_global_state.maint_irq); + return ret; + } + + kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); + return 0; +} diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c new file mode 100644 index 000000000000..c314c016659a --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. 
+ */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <trace/events/kvm.h> +#include <kvm/arm_vgic.h> +#include "vgic.h" + +/* + * vgic_irqfd_set_irq: inject the IRQ corresponding to the + * irqchip routing entry + * + * This is the entry point for irqfd IRQ injection + */ +static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS; + + if (!vgic_valid_spi(kvm, spi_id)) + return -EINVAL; + return kvm_vgic_inject_irq(kvm, NULL, spi_id, level, NULL); +} + +/** + * kvm_set_routing_entry: populate a kvm routing entry + * from a user routing entry + * + * @kvm: the VM this entry is applied to + * @e: kvm kernel routing entry handle + * @ue: user api routing entry handle + * return 0 on success, -EINVAL on errors. + */ +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = vgic_irqfd_set_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) || + (e->irqchip.irqchip >= KVM_NR_IRQCHIPS)) + goto out; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + e->msi.flags = ue->flags; + e->msi.devid = ue->u.msi.devid; + break; + default: + goto out; + } + r = 0; +out: + return r; +} + +static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm_msi *msi) +{ + msi->address_lo = e->msi.address_lo; + msi->address_hi = e->msi.address_hi; + msi->data = e->msi.data; + msi->flags = e->msi.flags; + msi->devid = e->msi.devid; +} + +/* + * kvm_set_msi: inject the MSI corresponding to the + * MSI routing entry + * + * This is the entry point for irqfd MSI injection + * and userspace MSI injection. + */ +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + struct kvm_msi msi; + + if (!vgic_has_its(kvm)) + return -ENODEV; + + if (!level) + return -1; + + kvm_populate_msi(e, &msi); + return vgic_its_inject_msi(kvm, &msi); +} + +/* + * kvm_arch_set_irq_inatomic: fast-path for irqfd injection + */ +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + if (!level) + return -EWOULDBLOCK; + + switch (e->type) { + case KVM_IRQ_ROUTING_MSI: { + struct kvm_msi msi; + + if (!vgic_has_its(kvm)) + break; + + kvm_populate_msi(e, &msi); + return vgic_its_inject_cached_translation(kvm, &msi); + } + + case KVM_IRQ_ROUTING_IRQCHIP: + /* + * Injecting SPIs is always possible in atomic context + * as long as the damn vgic is initialized. 
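+ *
+ * For reference, the pin is translated exactly as in vgic_irqfd_set_irq()
+ * above; with VGIC_NR_PRIVATE_IRQS == 32:
+ *
+ *   spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS;
+ *   // pin 0 -> INTID 32 (the first SPI), pin 4 -> INTID 36, and so on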
+ */ + if (unlikely(!vgic_initialized(kvm))) + break; + return vgic_irqfd_set_irq(e, kvm, irq_source_id, 1, line_status); + } + + return -EWOULDBLOCK; +} + +int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) +{ + struct kvm_irq_routing_entry *entries; + struct vgic_dist *dist = &kvm->arch.vgic; + u32 nr = dist->nr_spis; + int i, ret; + + entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL_ACCOUNT); + if (!entries) + return -ENOMEM; + + for (i = 0; i < nr; i++) { + entries[i].gsi = i; + entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; + entries[i].u.irqchip.irqchip = 0; + entries[i].u.irqchip.pin = i; + } + ret = kvm_set_irq_routing(kvm, entries, nr, 0); + kfree(entries); + return ret; +} diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c new file mode 100644 index 000000000000..3f1c4b10fed9 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -0,0 +1,2815 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GICv3 ITS emulation + * + * Copyright (C) 2015,2016 ARM Ltd. + * Author: Andre Przywara <andre.przywara@arm.com> + */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/uaccess.h> +#include <linux/list_sort.h> + +#include <linux/irqchip/arm-gic-v3.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +static struct kvm_device_ops kvm_arm_vgic_its_ops; + +static int vgic_its_save_tables_v0(struct vgic_its *its); +static int vgic_its_restore_tables_v0(struct vgic_its *its); +static int vgic_its_commit_v0(struct vgic_its *its); +static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, + struct kvm_vcpu *filter_vcpu, bool needs_inv); + +#define vgic_its_read_entry_lock(i, g, valp, t) \ + ({ \ + int __sz = vgic_its_get_abi(i)->t##_esz; \ + struct kvm *__k = (i)->dev->kvm; \ + int __ret; \ + \ + BUILD_BUG_ON(NR_ITS_ABIS == 1 && \ + sizeof(*(valp)) != ABI_0_ESZ); \ + if (NR_ITS_ABIS > 1 && \ + KVM_BUG_ON(__sz != sizeof(*(valp)), __k)) \ + __ret = -EINVAL; \ + else \ + __ret = kvm_read_guest_lock(__k, (g), \ + valp, __sz); \ + __ret; \ + }) + +#define vgic_its_write_entry_lock(i, g, val, t) \ + ({ \ + int __sz = vgic_its_get_abi(i)->t##_esz; \ + struct kvm *__k = (i)->dev->kvm; \ + typeof(val) __v = (val); \ + int __ret; \ + \ + BUILD_BUG_ON(NR_ITS_ABIS == 1 && \ + sizeof(__v) != ABI_0_ESZ); \ + if (NR_ITS_ABIS > 1 && \ + KVM_BUG_ON(__sz != sizeof(__v), __k)) \ + __ret = -EINVAL; \ + else \ + __ret = vgic_write_guest_lock(__k, (g), \ + &__v, __sz); \ + __ret; \ + }) + +/* + * Creates a new (reference to a) struct vgic_irq for a given LPI. + * If this LPI is already mapped on another ITS, we increase its refcount + * and return a pointer to the existing structure. + * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq. + * This function returns a pointer to the _unlocked_ structure. + */ +static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, + struct kvm_vcpu *vcpu) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; + unsigned long flags; + int ret; + + /* In this case there is no put, since we keep the reference. 
*/ + if (irq) + return irq; + + irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); + if (!irq) + return ERR_PTR(-ENOMEM); + + ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); + if (ret) { + kfree(irq); + return ERR_PTR(ret); + } + + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + + irq->config = VGIC_CONFIG_EDGE; + refcount_set(&irq->refcount, 1); + irq->intid = intid; + irq->target_vcpu = vcpu; + irq->group = 1; + + xa_lock_irqsave(&dist->lpi_xa, flags); + + /* + * There could be a race with another vgic_add_lpi(), so we need to + * check that we don't add a second list entry with the same LPI. + */ + oldirq = xa_load(&dist->lpi_xa, intid); + if (vgic_try_get_irq_ref(oldirq)) { + /* Someone was faster with adding this LPI, lets use that. */ + kfree(irq); + irq = oldirq; + } else { + ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0)); + } + + xa_unlock_irqrestore(&dist->lpi_xa, flags); + + if (ret) { + xa_release(&dist->lpi_xa, intid); + kfree(irq); + + return ERR_PTR(ret); + } + + /* + * We "cache" the configuration table entries in our struct vgic_irq's. + * However we only have those structs for mapped IRQs, so we read in + * the respective config data from memory here upon mapping the LPI. + * + * Should any of these fail, behave as if we couldn't create the LPI + * by dropping the refcount and returning the error. + */ + ret = update_lpi_config(kvm, irq, NULL, false); + if (ret) { + vgic_put_irq(kvm, irq); + return ERR_PTR(ret); + } + + ret = vgic_v3_lpi_sync_pending_status(kvm, irq); + if (ret) { + vgic_put_irq(kvm, irq); + return ERR_PTR(ret); + } + + return irq; +} + +/** + * struct vgic_its_abi - ITS abi ops and settings + * @cte_esz: collection table entry size + * @dte_esz: device table entry size + * @ite_esz: interrupt translation table entry size + * @save_tables: save the ITS tables into guest RAM + * @restore_tables: restore the ITS internal structs from tables + * stored in guest RAM + * @commit: initialize the registers which expose the ABI settings, + * especially the entry sizes + */ +struct vgic_its_abi { + int cte_esz; + int dte_esz; + int ite_esz; + int (*save_tables)(struct vgic_its *its); + int (*restore_tables)(struct vgic_its *its); + int (*commit)(struct vgic_its *its); +}; + +#define ABI_0_ESZ 8 +#define ESZ_MAX ABI_0_ESZ + +static const struct vgic_its_abi its_table_abi_versions[] = { + [0] = { + .cte_esz = ABI_0_ESZ, + .dte_esz = ABI_0_ESZ, + .ite_esz = ABI_0_ESZ, + .save_tables = vgic_its_save_tables_v0, + .restore_tables = vgic_its_restore_tables_v0, + .commit = vgic_its_commit_v0, + }, +}; + +#define NR_ITS_ABIS ARRAY_SIZE(its_table_abi_versions) + +inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) +{ + return &its_table_abi_versions[its->abi_rev]; +} + +static int vgic_its_set_abi(struct vgic_its *its, u32 rev) +{ + const struct vgic_its_abi *abi; + + its->abi_rev = rev; + abi = vgic_its_get_abi(its); + return abi->commit(its); +} + +/* + * Find and returns a device in the device table for an ITS. + * Must be called with the its_lock mutex held. + */ +static struct its_device *find_its_device(struct vgic_its *its, u32 device_id) +{ + struct its_device *device; + + list_for_each_entry(device, &its->device_list, dev_list) + if (device_id == device->device_id) + return device; + + return NULL; +} + +/* + * Find and returns an interrupt translation table entry (ITTE) for a given + * Device ID/Event ID pair on an ITS. + * Must be called with the its_lock mutex held. 
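+ *
+ * Typical caller pattern (sketch only; devid and eventid are
+ * placeholders, and the lookup result is only stable while the mutex
+ * is held):
+ *
+ *   mutex_lock(&its->its_lock);
+ *   ite = find_ite(its, devid, eventid);
+ *   if (ite)
+ *       ... use ite->irq and ite->collection ...
+ *   mutex_unlock(&its->its_lock);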
+ */ +static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, + u32 event_id) +{ + struct its_device *device; + struct its_ite *ite; + + device = find_its_device(its, device_id); + if (device == NULL) + return NULL; + + list_for_each_entry(ite, &device->itt_head, ite_list) + if (ite->event_id == event_id) + return ite; + + return NULL; +} + +/* To be used as an iterator this macro misses the enclosing parentheses */ +#define for_each_lpi_its(dev, ite, its) \ + list_for_each_entry(dev, &(its)->device_list, dev_list) \ + list_for_each_entry(ite, &(dev)->itt_head, ite_list) + +#define GIC_LPI_OFFSET 8192 + +#define VITS_TYPER_IDBITS 16 +#define VITS_MAX_EVENTID (BIT(VITS_TYPER_IDBITS) - 1) +#define VITS_TYPER_DEVBITS 16 +#define VITS_MAX_DEVID (BIT(VITS_TYPER_DEVBITS) - 1) +#define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1) +#define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1) + +/* + * Finds and returns a collection in the ITS collection table. + * Must be called with the its_lock mutex held. + */ +static struct its_collection *find_collection(struct vgic_its *its, int coll_id) +{ + struct its_collection *collection; + + list_for_each_entry(collection, &its->collection_list, coll_list) { + if (coll_id == collection->collection_id) + return collection; + } + + return NULL; +} + +#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) +#define LPI_PROP_PRIORITY(p) ((p) & 0xfc) + +/* + * Reads the configuration data for a given LPI from guest memory and + * updates the fields in struct vgic_irq. + * If filter_vcpu is not NULL, applies only if the IRQ is targeting this + * VCPU. Unconditionally applies if filter_vcpu is NULL. + */ +static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, + struct kvm_vcpu *filter_vcpu, bool needs_inv) +{ + u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); + u8 prop; + int ret; + unsigned long flags; + + ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET, + &prop, 1); + + if (ret) + return ret; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { + irq->priority = LPI_PROP_PRIORITY(prop); + irq->enabled = LPI_PROP_ENABLE_BIT(prop); + + if (!irq->hw) { + vgic_queue_irq_unlock(kvm, irq, flags); + return 0; + } + } + + if (irq->hw) + ret = its_prop_update_vlpi(irq->host_irq, prop, needs_inv); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + return ret; +} + +static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) +{ + struct its_vlpi_map map; + int ret; + + guard(raw_spinlock_irqsave)(&irq->irq_lock); + irq->target_vcpu = vcpu; + + if (!irq->hw) + return 0; + + ret = its_get_vlpi(irq->host_irq, &map); + if (ret) + return ret; + + if (map.vpe) + atomic_dec(&map.vpe->vlpi_count); + + map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + atomic_inc(&map.vpe->vlpi_count); + return its_map_vlpi(irq->host_irq, &map); +} + +static struct kvm_vcpu *collection_to_vcpu(struct kvm *kvm, + struct its_collection *col) +{ + return kvm_get_vcpu_by_id(kvm, col->target_addr); +} + +/* + * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI + * is targeting) to the VGIC's view, which deals with target VCPUs. + * Needs to be called whenever either the collection for a LPIs has + * changed or the collection itself got retargeted. 
+ */ +static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite) +{ + struct kvm_vcpu *vcpu; + + if (!its_is_collection_mapped(ite->collection)) + return; + + vcpu = collection_to_vcpu(kvm, ite->collection); + update_affinity(ite->irq, vcpu); +} + +/* + * Updates the target VCPU for every LPI targeting this collection. + * Must be called with the its_lock mutex held. + */ +static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, + struct its_collection *coll) +{ + struct its_device *device; + struct its_ite *ite; + + for_each_lpi_its(device, ite, its) { + if (ite->collection != coll) + continue; + + update_affinity_ite(kvm, ite); + } +} + +static u32 max_lpis_propbaser(u64 propbaser) +{ + int nr_idbits = (propbaser & 0x1f) + 1; + + return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS); +} + +/* + * Sync the pending table pending bit of LPIs targeting @vcpu + * with our own data structures. This relies on the LPI being + * mapped before. + */ +static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) +{ + gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + unsigned long intid, flags; + struct vgic_irq *irq; + int last_byte_offset = -1; + int ret = 0; + u8 pendmask; + + xa_for_each(&dist->lpi_xa, intid, irq) { + int byte_offset, bit_nr; + + byte_offset = intid / BITS_PER_BYTE; + bit_nr = intid % BITS_PER_BYTE; + + /* + * For contiguously allocated LPIs chances are we just read + * this very same byte in the last iteration. Reuse that. + */ + if (byte_offset != last_byte_offset) { + ret = kvm_read_guest_lock(vcpu->kvm, + pendbase + byte_offset, + &pendmask, 1); + if (ret) + return ret; + + last_byte_offset = byte_offset; + } + + irq = vgic_get_irq(vcpu->kvm, intid); + if (!irq) + continue; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->target_vcpu == vcpu) + irq->pending_latch = pendmask & (1U << bit_nr); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + vgic_put_irq(vcpu->kvm, irq); + } + + return ret; +} + +static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 reg = GITS_TYPER_PLPIS; + + /* + * We use linear CPU numbers for redistributor addressing, + * so GITS_TYPER.PTA is 0. + * Also we force all PROPBASER registers to be the same, so + * CommonLPIAff is 0 as well. + * To avoid memory waste in the guest, we keep the number of IDBits and + * DevBits low - as least for the time being. 
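+ *
+ * With the constants used below (VITS_TYPER_DEVBITS == 16,
+ * VITS_TYPER_IDBITS == 16, 8-byte ITT entries) and assuming the usual
+ * "value minus one" encoding of GIC_ENCODE_SZ() and GITS_TYPER, the
+ * guest roughly sees:
+ *
+ *   TYPER.Devbits        = 15   // 16-bit DeviceIDs
+ *   TYPER.IDbits         = 15   // 16-bit EventIDs
+ *   TYPER.ITT_entry_size = 7    // 8-byte translation table entries
+ *   TYPER.PLPIS          = 1    // physical LPIs supported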
+ */ + reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT; + reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT; + reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; + + return extract_bytes(reg, addr & 7, len); +} + +static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u32 val; + + val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK; + val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM; + return val; +} + +static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 rev = GITS_IIDR_REV(val); + + if (rev >= NR_ITS_ABIS) + return -EINVAL; + return vgic_its_set_abi(its, rev); +} + +static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + switch (addr & 0xffff) { + case GITS_PIDR0: + return 0x92; /* part number, bits[7:0] */ + case GITS_PIDR1: + return 0xb4; /* part number, bits[11:8] */ + case GITS_PIDR2: + return GIC_PIDR2_ARCH_GICv3 | 0x0b; + case GITS_PIDR4: + return 0x40; /* This is a 64K software visible page */ + /* The following are the ID registers for (any) GIC. */ + case GITS_CIDR0: + return 0x0d; + case GITS_CIDR1: + return 0xf0; + case GITS_CIDR2: + return 0x05; + case GITS_CIDR3: + return 0xb1; + } + + return 0; +} + +static struct vgic_its *__vgic_doorbell_to_its(struct kvm *kvm, gpa_t db) +{ + struct kvm_io_device *kvm_io_dev; + struct vgic_io_device *iodev; + + kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, db); + if (!kvm_io_dev) + return ERR_PTR(-EINVAL); + + if (kvm_io_dev->ops != &kvm_io_gic_ops) + return ERR_PTR(-EINVAL); + + iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); + if (iodev->iodev_type != IODEV_ITS) + return ERR_PTR(-EINVAL); + + return iodev->its; +} + +static unsigned long vgic_its_cache_key(u32 devid, u32 eventid) +{ + return (((unsigned long)devid) << VITS_TYPER_IDBITS) | eventid; + +} + +static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, + u32 devid, u32 eventid) +{ + unsigned long cache_key = vgic_its_cache_key(devid, eventid); + struct vgic_its *its; + struct vgic_irq *irq; + + if (devid > VITS_MAX_DEVID || eventid > VITS_MAX_EVENTID) + return NULL; + + its = __vgic_doorbell_to_its(kvm, db); + if (IS_ERR(its)) + return NULL; + + rcu_read_lock(); + + irq = xa_load(&its->translation_cache, cache_key); + if (!vgic_try_get_irq_ref(irq)) + irq = NULL; + + rcu_read_unlock(); + + return irq; +} + +static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, + struct vgic_irq *irq) +{ + unsigned long cache_key = vgic_its_cache_key(devid, eventid); + struct vgic_irq *old; + + /* Do not cache a directly injected interrupt */ + if (irq->hw) + return; + + /* + * The irq refcount is guaranteed to be nonzero while holding the + * its_lock, as the ITE (and the reference it holds) cannot be freed. + */ + lockdep_assert_held(&its->its_lock); + vgic_get_irq_ref(irq); + + old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); + + /* + * Put the reference taken on @irq if the store fails. Intentionally do + * not return the error as the translation cache is best effort. 
+ */ + if (xa_is_err(old)) { + vgic_put_irq(kvm, irq); + return; + } + + /* + * We could have raced with another CPU caching the same + * translation behind our back, ensure we don't leak a + * reference if that is the case. + */ + if (old) + vgic_put_irq(kvm, old); +} + +static void vgic_its_invalidate_cache(struct vgic_its *its) +{ + struct kvm *kvm = its->dev->kvm; + struct vgic_irq *irq; + unsigned long idx; + + xa_for_each(&its->translation_cache, idx, irq) { + xa_erase(&its->translation_cache, idx); + vgic_put_irq(kvm, irq); + } +} + +void vgic_its_invalidate_all_caches(struct kvm *kvm) +{ + struct kvm_device *dev; + struct vgic_its *its; + + rcu_read_lock(); + + list_for_each_entry_rcu(dev, &kvm->devices, vm_node) { + if (dev->ops != &kvm_arm_vgic_its_ops) + continue; + + its = dev->private; + vgic_its_invalidate_cache(its); + } + + rcu_read_unlock(); +} + +int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, struct vgic_irq **irq) +{ + struct kvm_vcpu *vcpu; + struct its_ite *ite; + + if (!its->enabled) + return -EBUSY; + + ite = find_ite(its, devid, eventid); + if (!ite || !its_is_collection_mapped(ite->collection)) + return E_ITS_INT_UNMAPPED_INTERRUPT; + + vcpu = collection_to_vcpu(kvm, ite->collection); + if (!vcpu) + return E_ITS_INT_UNMAPPED_INTERRUPT; + + if (!vgic_lpis_enabled(vcpu)) + return -EBUSY; + + vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq); + + *irq = ite->irq; + return 0; +} + +struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) +{ + u64 address; + + if (!vgic_has_its(kvm)) + return ERR_PTR(-ENODEV); + + if (!(msi->flags & KVM_MSI_VALID_DEVID)) + return ERR_PTR(-EINVAL); + + address = (u64)msi->address_hi << 32 | msi->address_lo; + + return __vgic_doorbell_to_its(kvm, address); +} + +/* + * Find the target VCPU and the LPI number for a given devid/eventid pair + * and make this IRQ pending, possibly injecting it. + * Must be called with the its_lock mutex held. + * Returns 0 on success, a positive error value for any ITS mapping + * related errors and negative error values for generic errors. + */ +static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid) +{ + struct vgic_irq *irq = NULL; + unsigned long flags; + int err; + + err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq); + if (err) + return err; + + if (irq->hw) + return irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, true); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + vgic_queue_irq_unlock(kvm, irq, flags); + + return 0; +} + +int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) +{ + struct vgic_irq *irq; + unsigned long flags; + phys_addr_t db; + + db = (u64)msi->address_hi << 32 | msi->address_lo; + irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data); + if (!irq) + return -EWOULDBLOCK; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + vgic_queue_irq_unlock(kvm, irq, flags); + vgic_put_irq(kvm, irq); + + return 0; +} + +/* + * Queries the KVM IO bus framework to get the ITS pointer from the given + * doorbell address. + * We then call vgic_its_trigger_msi() with the decoded data. + * According to the KVM_SIGNAL_MSI API description returns 1 on success. 
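+ *
+ * Userspace sketch of the corresponding ioctl; its_base, device_id and
+ * event_id are placeholders, and the doorbell must be the GITS_TRANSLATER
+ * frame of the ITS the device is mapped on:
+ *
+ *   struct kvm_msi msi = {
+ *       .address_lo = its_base + GITS_TRANSLATER,
+ *       .address_hi = 0,
+ *       .data       = event_id,
+ *       .devid      = device_id,
+ *       .flags      = KVM_MSI_VALID_DEVID,
+ *   };
+ *   ret = ioctl(vm_fd, KVM_SIGNAL_MSI, &msi);
+ *   // ret > 0: delivered, ret == 0: blocked (mapping error), ret < 0: error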
+ */ +int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + struct vgic_its *its; + int ret; + + if (!vgic_its_inject_cached_translation(kvm, msi)) + return 1; + + its = vgic_msi_to_its(kvm, msi); + if (IS_ERR(its)) + return PTR_ERR(its); + + mutex_lock(&its->its_lock); + ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data); + mutex_unlock(&its->its_lock); + + if (ret < 0) + return ret; + + /* + * KVM_SIGNAL_MSI demands a return value > 0 for success and 0 + * if the guest has blocked the MSI. So we map any LPI mapping + * related error to that. + */ + if (ret) + return 0; + else + return 1; +} + +/* Requires the its_lock to be held. */ +static void its_free_ite(struct kvm *kvm, struct its_ite *ite) +{ + struct vgic_irq *irq = ite->irq; + list_del(&ite->ite_list); + + /* This put matches the get in vgic_add_lpi. */ + if (irq) { + scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) { + if (irq->hw) + its_unmap_vlpi(ite->irq->host_irq); + + irq->hw = false; + } + + vgic_put_irq(kvm, ite->irq); + } + + kfree(ite); +} + +static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) +{ + return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1); +} + +#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) +#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) +#define its_cmd_get_size(cmd) (its_cmd_mask_field(cmd, 1, 0, 5) + 1) +#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) +#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) +#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) +#define its_cmd_get_ittaddr(cmd) (its_cmd_mask_field(cmd, 2, 8, 44) << 8) +#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32) +#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1) + +/* + * The DISCARD command frees an Interrupt Translation Table Entry (ITTE). + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_ite *ite; + + ite = find_ite(its, device_id, event_id); + if (ite && its_is_collection_mapped(ite->collection)) { + struct its_device *device = find_its_device(its, device_id); + int ite_esz = vgic_its_get_abi(its)->ite_esz; + gpa_t gpa = device->itt_addr + ite->event_id * ite_esz; + /* + * Though the spec talks about removing the pending state, we + * don't bother here since we clear the ITTE anyway and the + * pending state is a property of the ITTE struct. + */ + vgic_its_invalidate_cache(its); + + its_free_ite(kvm, ite); + + return vgic_its_write_entry_lock(its, gpa, 0ULL, ite); + } + + return E_ITS_DISCARD_UNMAPPED_INTERRUPT; +} + +/* + * The MOVI command moves an ITTE to a different collection. + * Must be called with the its_lock mutex held. 
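+ *
+ * Field extraction for MOVI, using the accessors above (DWn is the n-th
+ * 64-bit doubleword of the 32-byte command):
+ *
+ *   device_id = its_cmd_get_deviceid(its_cmd);    // DW0[63:32]
+ *   event_id  = its_cmd_get_id(its_cmd);          // DW1[31:0]
+ *   coll_id   = its_cmd_get_collection(its_cmd);  // DW2[15:0]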
+ */ +static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct kvm_vcpu *vcpu; + struct its_ite *ite; + struct its_collection *collection; + + ite = find_ite(its, device_id, event_id); + if (!ite) + return E_ITS_MOVI_UNMAPPED_INTERRUPT; + + if (!its_is_collection_mapped(ite->collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + ite->collection = collection; + vcpu = collection_to_vcpu(kvm, collection); + + vgic_its_invalidate_cache(its); + + return update_affinity(ite->irq, vcpu); +} + +static bool __is_visible_gfn_locked(struct vgic_its *its, gpa_t gpa) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int idx; + bool ret; + + idx = srcu_read_lock(&its->dev->kvm->srcu); + ret = kvm_is_visible_gfn(its->dev->kvm, gfn); + srcu_read_unlock(&its->dev->kvm->srcu, idx); + return ret; +} + +/* + * Check whether an ID can be stored into the corresponding guest table. + * For a direct table this is pretty easy, but gets a bit nasty for + * indirect tables. We check whether the resulting guest physical address + * is actually valid (covered by a memslot and guest accessible). + * For this we have to read the respective first level entry. + */ +static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, + gpa_t *eaddr) +{ + int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + u64 indirect_ptr, type = GITS_BASER_TYPE(baser); + phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); + int esz = GITS_BASER_ENTRY_SIZE(baser); + int index; + + switch (type) { + case GITS_BASER_TYPE_DEVICE: + if (id > VITS_MAX_DEVID) + return false; + break; + case GITS_BASER_TYPE_COLLECTION: + /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */ + if (id >= BIT_ULL(16)) + return false; + break; + default: + return false; + } + + if (!(baser & GITS_BASER_INDIRECT)) { + phys_addr_t addr; + + if (id >= (l1_tbl_size / esz)) + return false; + + addr = base + id * esz; + + if (eaddr) + *eaddr = addr; + + return __is_visible_gfn_locked(its, addr); + } + + /* calculate and check the index into the 1st level */ + index = id / (SZ_64K / esz); + if (index >= (l1_tbl_size / sizeof(u64))) + return false; + + /* Each 1st level entry is represented by a 64-bit value. */ + if (kvm_read_guest_lock(its->dev->kvm, + base + index * sizeof(indirect_ptr), + &indirect_ptr, sizeof(indirect_ptr))) + return false; + + indirect_ptr = le64_to_cpu(indirect_ptr); + + /* check the valid bit of the first level entry */ + if (!(indirect_ptr & BIT_ULL(63))) + return false; + + /* Mask the guest physical address and calculate the frame number. */ + indirect_ptr &= GENMASK_ULL(51, 16); + + /* Find the address of the actual entry */ + index = id % (SZ_64K / esz); + indirect_ptr += index * esz; + + if (eaddr) + *eaddr = indirect_ptr; + + return __is_visible_gfn_locked(its, indirect_ptr); +} + +/* + * Check whether an event ID can be stored in the corresponding Interrupt + * Translation Table, which starts at device->itt_addr. 
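+ *
+ * Worked example with the ABI 0 entry size (ite_esz == 8) and a device
+ * mapped with num_eventid_bits == 4:
+ *
+ *   valid event IDs:  0 .. BIT(4) - 1 = 15
+ *   entry address:    gpa = itt_addr + event_id * 8
+ *   table footprint:  BIT(4) * 8 = 128 bytes of guest memory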
+ */ +static bool vgic_its_check_event_id(struct vgic_its *its, struct its_device *device, + u32 event_id) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int ite_esz = abi->ite_esz; + gpa_t gpa; + + /* max table size is: BIT_ULL(device->num_eventid_bits) * ite_esz */ + if (event_id >= BIT_ULL(device->num_eventid_bits)) + return false; + + gpa = device->itt_addr + event_id * ite_esz; + return __is_visible_gfn_locked(its, gpa); +} + +/* + * Add a new collection into the ITS collection table. + * Returns 0 on success, and a negative error value for generic errors. + */ +static int vgic_its_alloc_collection(struct vgic_its *its, + struct its_collection **colp, + u32 coll_id) +{ + struct its_collection *collection; + + collection = kzalloc(sizeof(*collection), GFP_KERNEL_ACCOUNT); + if (!collection) + return -ENOMEM; + + collection->collection_id = coll_id; + collection->target_addr = COLLECTION_NOT_MAPPED; + + list_add_tail(&collection->coll_list, &its->collection_list); + *colp = collection; + + return 0; +} + +static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) +{ + struct its_collection *collection; + struct its_device *device; + struct its_ite *ite; + + /* + * Clearing the mapping for that collection ID removes the + * entry from the list. If there wasn't any before, we can + * go home early. + */ + collection = find_collection(its, coll_id); + if (!collection) + return; + + for_each_lpi_its(device, ite, its) + if (ite->collection && + ite->collection->collection_id == coll_id) + ite->collection = NULL; + + list_del(&collection->coll_list); + kfree(collection); +} + +/* Must be called with its_lock mutex held */ +static struct its_ite *vgic_its_alloc_ite(struct its_device *device, + struct its_collection *collection, + u32 event_id) +{ + struct its_ite *ite; + + ite = kzalloc(sizeof(*ite), GFP_KERNEL_ACCOUNT); + if (!ite) + return ERR_PTR(-ENOMEM); + + ite->event_id = event_id; + ite->collection = collection; + + list_add_tail(&ite->ite_list, &device->itt_head); + return ite; +} + +/* + * The MAPTI and MAPI commands map LPIs to ITTEs. + * Must be called with its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_ite *ite; + struct kvm_vcpu *vcpu = NULL; + struct its_device *device; + struct its_collection *collection, *new_coll = NULL; + struct vgic_irq *irq; + int lpi_nr; + + device = find_its_device(its, device_id); + if (!device) + return E_ITS_MAPTI_UNMAPPED_DEVICE; + + if (!vgic_its_check_event_id(its, device, event_id)) + return E_ITS_MAPTI_ID_OOR; + + if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) + lpi_nr = its_cmd_get_physical_id(its_cmd); + else + lpi_nr = event_id; + if (lpi_nr < GIC_LPI_OFFSET || + lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) + return E_ITS_MAPTI_PHYSICALID_OOR; + + /* If there is an existing mapping, behavior is UNPREDICTABLE. 
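+ * KVM's choice for this UNPREDICTABLE case is to silently ignore the
+ * second mapping: the early return below leaves the existing ITE in
+ * place and reports success to the guest.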
*/ + if (find_ite(its, device_id, event_id)) + return 0; + + collection = find_collection(its, coll_id); + if (!collection) { + int ret; + + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) + return E_ITS_MAPC_COLLECTION_OOR; + + ret = vgic_its_alloc_collection(its, &collection, coll_id); + if (ret) + return ret; + new_coll = collection; + } + + ite = vgic_its_alloc_ite(device, collection, event_id); + if (IS_ERR(ite)) { + if (new_coll) + vgic_its_free_collection(its, coll_id); + return PTR_ERR(ite); + } + + if (its_is_collection_mapped(collection)) + vcpu = collection_to_vcpu(kvm, collection); + + irq = vgic_add_lpi(kvm, lpi_nr, vcpu); + if (IS_ERR(irq)) { + if (new_coll) + vgic_its_free_collection(its, coll_id); + its_free_ite(kvm, ite); + return PTR_ERR(irq); + } + ite->irq = irq; + + return 0; +} + +/* Requires the its_lock to be held. */ +static void vgic_its_free_device(struct kvm *kvm, struct vgic_its *its, + struct its_device *device) +{ + struct its_ite *ite, *temp; + + /* + * The spec says that unmapping a device with still valid + * ITTEs associated is UNPREDICTABLE. We remove all ITTEs, + * since we cannot leave the memory unreferenced. + */ + list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list) + its_free_ite(kvm, ite); + + vgic_its_invalidate_cache(its); + + list_del(&device->dev_list); + kfree(device); +} + +/* its lock must be held */ +static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its) +{ + struct its_device *cur, *temp; + + list_for_each_entry_safe(cur, temp, &its->device_list, dev_list) + vgic_its_free_device(kvm, its, cur); +} + +/* its lock must be held */ +static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its) +{ + struct its_collection *cur, *temp; + + list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list) + vgic_its_free_collection(its, cur->collection_id); +} + +/* Must be called with its_lock mutex held */ +static struct its_device *vgic_its_alloc_device(struct vgic_its *its, + u32 device_id, gpa_t itt_addr, + u8 num_eventid_bits) +{ + struct its_device *device; + + device = kzalloc(sizeof(*device), GFP_KERNEL_ACCOUNT); + if (!device) + return ERR_PTR(-ENOMEM); + + device->device_id = device_id; + device->itt_addr = itt_addr; + device->num_eventid_bits = num_eventid_bits; + INIT_LIST_HEAD(&device->itt_head); + + list_add_tail(&device->dev_list, &its->device_list); + return device; +} + +/* + * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + bool valid = its_cmd_get_validbit(its_cmd); + u8 num_eventid_bits = its_cmd_get_size(its_cmd); + gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); + struct its_device *device; + gpa_t gpa; + + if (!vgic_its_check_id(its, its->baser_device_table, device_id, &gpa)) + return E_ITS_MAPD_DEVICE_OOR; + + if (valid && num_eventid_bits > VITS_TYPER_IDBITS) + return E_ITS_MAPD_ITTSIZE_OOR; + + device = find_its_device(its, device_id); + + /* + * The spec says that calling MAPD on an already mapped device + * invalidates all cached data for this device. We implement this + * by removing the mapping and re-establishing it. + */ + if (device) + vgic_its_free_device(kvm, its, device); + + /* + * The spec does not say whether unmapping a not-mapped device + * is an error, so we are done in any case. 
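+ *
+ * For the !valid case this boils down to clearing the saved DTE: an
+ * all-zero entry (8 bytes with the current ABI) is written back at @gpa
+ * and success is reported, whether or not a device was actually mapped.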
+ */ + if (!valid) + return vgic_its_write_entry_lock(its, gpa, 0ULL, dte); + + device = vgic_its_alloc_device(its, device_id, itt_addr, + num_eventid_bits); + + return PTR_ERR_OR_ZERO(device); +} + +/* + * The MAPC command maps collection IDs to redistributors. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u16 coll_id; + struct its_collection *collection; + bool valid; + + valid = its_cmd_get_validbit(its_cmd); + coll_id = its_cmd_get_collection(its_cmd); + + if (!valid) { + vgic_its_free_collection(its, coll_id); + vgic_its_invalidate_cache(its); + } else { + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd)); + if (!vcpu) + return E_ITS_MAPC_PROCNUM_OOR; + + collection = find_collection(its, coll_id); + + if (!collection) { + int ret; + + if (!vgic_its_check_id(its, its->baser_coll_table, + coll_id, NULL)) + return E_ITS_MAPC_COLLECTION_OOR; + + ret = vgic_its_alloc_collection(its, &collection, + coll_id); + if (ret) + return ret; + collection->target_addr = vcpu->vcpu_id; + } else { + collection->target_addr = vcpu->vcpu_id; + update_affinity_collection(kvm, its, collection); + } + } + + return 0; +} + +/* + * The CLEAR command removes the pending state for a particular LPI. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_ite *ite; + + + ite = find_ite(its, device_id, event_id); + if (!ite) + return E_ITS_CLEAR_UNMAPPED_INTERRUPT; + + ite->irq->pending_latch = false; + + if (ite->irq->hw) + return irq_set_irqchip_state(ite->irq->host_irq, + IRQCHIP_STATE_PENDING, false); + + return 0; +} + +int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq) +{ + return update_lpi_config(kvm, irq, NULL, true); +} + +/* + * The INV command syncs the configuration bits from the memory table. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_ite *ite; + + + ite = find_ite(its, device_id, event_id); + if (!ite) + return E_ITS_INV_UNMAPPED_INTERRUPT; + + return vgic_its_inv_lpi(kvm, ite->irq); +} + +/** + * vgic_its_invall - invalidate all LPIs targeting a given vcpu + * @vcpu: the vcpu for which the RD is targeted by an invalidation + * + * Contrary to the INVALL command, this targets a RD instead of a + * collection, and we don't need to hold the its_lock, since no ITS is + * involved here. + */ +int vgic_its_invall(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + unsigned long intid; + + xa_for_each(&dist->lpi_xa, intid, irq) { + irq = vgic_get_irq(kvm, intid); + if (!irq) + continue; + + update_lpi_config(kvm, irq, vcpu, false); + vgic_put_irq(kvm, irq); + } + + if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) + its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); + + return 0; +} + +/* + * The INVALL command requests flushing of all IRQ data in this collection. + * Find the VCPU mapped to that collection, then iterate over the VM's list + * of mapped LPIs and update the configuration for each IRQ which targets + * the specified vcpu. 
The configuration will be read from the in-memory + * configuration table. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_collection *collection; + struct kvm_vcpu *vcpu; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_INVALL_UNMAPPED_COLLECTION; + + vcpu = collection_to_vcpu(kvm, collection); + vgic_its_invall(vcpu); + + return 0; +} + +/* + * The MOVALL command moves the pending state of all IRQs targeting one + * redistributor to another. We don't hold the pending state in the VCPUs, + * but in the IRQs instead, so there is really not much to do for us here. + * However the spec says that no IRQ must target the old redistributor + * afterwards, so we make sure that no LPI is using the associated target_vcpu. + * This command affects all LPIs in the system that target that redistributor. + */ +static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu1, *vcpu2; + struct vgic_irq *irq; + unsigned long intid; + + /* We advertise GITS_TYPER.PTA==0, making the address the vcpu ID */ + vcpu1 = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd)); + vcpu2 = kvm_get_vcpu_by_id(kvm, its_cmd_mask_field(its_cmd, 3, 16, 32)); + + if (!vcpu1 || !vcpu2) + return E_ITS_MOVALL_PROCNUM_OOR; + + if (vcpu1 == vcpu2) + return 0; + + xa_for_each(&dist->lpi_xa, intid, irq) { + irq = vgic_get_irq(kvm, intid); + if (!irq) + continue; + + update_affinity(irq, vcpu2); + + vgic_put_irq(kvm, irq); + } + + vgic_its_invalidate_cache(its); + + return 0; +} + +/* + * The INT command injects the LPI associated with that DevID/EvID pair. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 msi_data = its_cmd_get_id(its_cmd); + u64 msi_devid = its_cmd_get_deviceid(its_cmd); + + return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); +} + +/* + * This function is called with the its_cmd lock held, but the ITS data + * structure lock dropped. 
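+ *
+ * Resulting lock nesting for command processing (sketch; the outer lock
+ * is taken by the CWRITER/CREADR handlers before they call
+ * vgic_its_process_commands()):
+ *
+ *   mutex_lock(&its->cmd_lock);       // serialises the command queue
+ *     mutex_lock(&its->its_lock);     // taken below, per command
+ *     ... handle one 32-byte command ...
+ *     mutex_unlock(&its->its_lock);
+ *   mutex_unlock(&its->cmd_lock);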
+ */ +static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + int ret = -ENODEV; + + mutex_lock(&its->its_lock); + switch (its_cmd_get_command(its_cmd)) { + case GITS_CMD_MAPD: + ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); + break; + case GITS_CMD_MAPC: + ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); + break; + case GITS_CMD_MAPI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); + break; + case GITS_CMD_MAPTI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); + break; + case GITS_CMD_MOVI: + ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); + break; + case GITS_CMD_DISCARD: + ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd); + break; + case GITS_CMD_CLEAR: + ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd); + break; + case GITS_CMD_MOVALL: + ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); + break; + case GITS_CMD_INT: + ret = vgic_its_cmd_handle_int(kvm, its, its_cmd); + break; + case GITS_CMD_INV: + ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); + break; + case GITS_CMD_INVALL: + ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd); + break; + case GITS_CMD_SYNC: + /* we ignore this command: we are in sync all of the time */ + ret = 0; + break; + } + mutex_unlock(&its->its_lock); + + return ret; +} + +static u64 vgic_sanitise_its_baser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK, + GITS_BASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK, + GITS_BASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK, + GITS_BASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* We support only one (ITS) page size: 64K */ + reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; + + return reg; +} + +static u64 vgic_sanitise_its_cbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK, + GITS_CBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK, + GITS_CBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK, + GITS_CBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* Sanitise the physical address to be 64k aligned. */ + reg &= ~GENMASK_ULL(15, 12); + + return reg; +} + +static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cbaser, addr & 7, len); +} + +static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + /* When GITS_CTLR.Enable is 1, this register is RO. */ + if (its->enabled) + return; + + mutex_lock(&its->cmd_lock); + its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val); + its->cbaser = vgic_sanitise_its_cbaser(its->cbaser); + its->creadr = 0; + /* + * CWRITER is architecturally UNKNOWN on reset, but we need to reset + * it to CREADR to make sure we start with an empty command buffer. + */ + its->cwriter = its->creadr; + mutex_unlock(&its->cmd_lock); +} + +#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12) +#define ITS_CMD_SIZE 32 +#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) + +/* Must be called with the cmd_lock held. 
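+ * The cmd_lock protects cbaser, creadr and cwriter while the command
+ * queue is drained below.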
*/ +static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) +{ + gpa_t cbaser; + u64 cmd_buf[4]; + + /* Commands are only processed when the ITS is enabled. */ + if (!its->enabled) + return; + + cbaser = GITS_CBASER_ADDRESS(its->cbaser); + + while (its->cwriter != its->creadr) { + int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, + cmd_buf, ITS_CMD_SIZE); + /* + * If kvm_read_guest() fails, this could be due to the guest + * programming a bogus value in CBASER or something else going + * wrong from which we cannot easily recover. + * According to section 6.3.2 in the GICv3 spec we can just + * ignore that command then. + */ + if (!ret) + vgic_its_handle_command(kvm, its, cmd_buf); + + its->creadr += ITS_CMD_SIZE; + if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) + its->creadr = 0; + } +} + +/* + * By writing to CWRITER the guest announces new commands to be processed. + * To avoid any races in the first place, we take the its_cmd lock, which + * protects our ring buffer variables, so that there is only one user + * per ITS handling commands at a given time. + */ +static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u64 reg; + + if (!its) + return; + + mutex_lock(&its->cmd_lock); + + reg = update_64bit_reg(its->cwriter, addr & 7, len, val); + reg = ITS_CMD_OFFSET(reg); + if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { + mutex_unlock(&its->cmd_lock); + return; + } + its->cwriter = reg; + + vgic_its_process_commands(kvm, its); + + mutex_unlock(&its->cmd_lock); +} + +static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cwriter, addr & 0x7, len); +} + +static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->creadr, addr & 0x7, len); +} + +static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 cmd_offset; + int ret = 0; + + mutex_lock(&its->cmd_lock); + + if (its->enabled) { + ret = -EBUSY; + goto out; + } + + cmd_offset = ITS_CMD_OFFSET(val); + if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { + ret = -EINVAL; + goto out; + } + + its->creadr = cmd_offset; +out: + mutex_unlock(&its->cmd_lock); + return ret; +} + +#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7) +static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u64 reg; + + switch (BASER_INDEX(addr)) { + case 0: + reg = its->baser_device_table; + break; + case 1: + reg = its->baser_coll_table; + break; + default: + reg = 0; + break; + } + + return extract_bytes(reg, addr & 7, len); +} + +#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) +static void vgic_mmio_write_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 entry_size, table_type; + u64 reg, *regptr, clearbits = 0; + + /* When GITS_CTLR.Enable is 1, we ignore write accesses. 
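+ * Like GITS_CBASER, the BASER registers can only be updated while the
+ * ITS is disabled.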
*/ + if (its->enabled) + return; + + switch (BASER_INDEX(addr)) { + case 0: + regptr = &its->baser_device_table; + entry_size = abi->dte_esz; + table_type = GITS_BASER_TYPE_DEVICE; + break; + case 1: + regptr = &its->baser_coll_table; + entry_size = abi->cte_esz; + table_type = GITS_BASER_TYPE_COLLECTION; + clearbits = GITS_BASER_INDIRECT; + break; + default: + return; + } + + reg = update_64bit_reg(*regptr, addr & 7, len, val); + reg &= ~GITS_BASER_RO_MASK; + reg &= ~clearbits; + + reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; + reg |= table_type << GITS_BASER_TYPE_SHIFT; + reg = vgic_sanitise_its_baser(reg); + + *regptr = reg; + + if (!(reg & GITS_BASER_VALID)) { + /* Take the its_lock to prevent a race with a save/restore */ + mutex_lock(&its->its_lock); + switch (table_type) { + case GITS_BASER_TYPE_DEVICE: + vgic_its_free_device_list(kvm, its); + break; + case GITS_BASER_TYPE_COLLECTION: + vgic_its_free_collection_list(kvm, its); + break; + } + mutex_unlock(&its->its_lock); + } +} + +static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u32 reg = 0; + + mutex_lock(&its->cmd_lock); + if (its->creadr == its->cwriter) + reg |= GITS_CTLR_QUIESCENT; + if (its->enabled) + reg |= GITS_CTLR_ENABLE; + mutex_unlock(&its->cmd_lock); + + return reg; +} + +static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + mutex_lock(&its->cmd_lock); + + /* + * It is UNPREDICTABLE to enable the ITS if any of the CBASER or + * device/collection BASER are invalid + */ + if (!its->enabled && (val & GITS_CTLR_ENABLE) && + (!(its->baser_device_table & GITS_BASER_VALID) || + !(its->baser_coll_table & GITS_BASER_VALID) || + !(its->cbaser & GITS_CBASER_VALID))) + goto out; + + its->enabled = !!(val & GITS_CTLR_ENABLE); + if (!its->enabled) + vgic_its_invalidate_cache(its); + + /* + * Try to process any pending commands. This function bails out early + * if the ITS is disabled or no commands have been queued. 
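+ * Commands queued via GITS_CWRITER while the ITS was disabled are
+ * drained here once it is enabled.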
+ */ + vgic_its_process_commands(kvm, its); + +out: + mutex_unlock(&its->cmd_lock); +} + +#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ +{ \ + .reg_offset = off, \ + .len = length, \ + .access_flags = acc, \ + .its_read = rd, \ + .its_write = wr, \ +} + +#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\ +{ \ + .reg_offset = off, \ + .len = length, \ + .access_flags = acc, \ + .its_read = rd, \ + .its_write = wr, \ + .uaccess_its_write = uwr, \ +} + +static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, unsigned long val) +{ + /* Ignore */ +} + +static struct vgic_register_region its_registers[] = { + REGISTER_ITS_DESC(GITS_CTLR, + vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC_UACCESS(GITS_IIDR, + vgic_mmio_read_its_iidr, its_mmio_write_wi, + vgic_mmio_uaccess_write_its_iidr, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_TYPER, + vgic_mmio_read_its_typer, its_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CBASER, + vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CWRITER, + vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC_UACCESS(GITS_CREADR, + vgic_mmio_read_its_creadr, its_mmio_write_wi, + vgic_mmio_uaccess_write_its_creadr, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_BASER, + vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_IDREGS_BASE, + vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30, + VGIC_ACCESS_32bit), +}; + +/* This is called on setting the LPI enable bit in the redistributor. 
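+ * Unless the guest set GICR_PENDBASER.PTZ, the LPI pending table is
+ * read from guest memory to pick up interrupts made pending earlier.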
*/ +void vgic_enable_lpis(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)) + its_sync_lpi_pending_table(vcpu); +} + +static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its, + u64 addr) +{ + struct vgic_io_device *iodev = &its->iodev; + int ret; + + mutex_lock(&kvm->slots_lock); + if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { + ret = -EBUSY; + goto out; + } + + its->vgic_its_base = addr; + iodev->regions = its_registers; + iodev->nr_regions = ARRAY_SIZE(its_registers); + kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops); + + iodev->base_addr = its->vgic_its_base; + iodev->iodev_type = IODEV_ITS; + iodev->its = its; + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr, + KVM_VGIC_V3_ITS_SIZE, &iodev->dev); +out: + mutex_unlock(&kvm->slots_lock); + + return ret; +} + +#define INITIAL_BASER_VALUE \ + (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \ + GITS_BASER_PAGE_SIZE_64K) + +#define INITIAL_PROPBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)) + +static int vgic_its_create(struct kvm_device *dev, u32 type) +{ + int ret; + struct vgic_its *its; + + if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) + return -ENODEV; + + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL_ACCOUNT); + if (!its) + return -ENOMEM; + + mutex_lock(&dev->kvm->arch.config_lock); + + if (vgic_initialized(dev->kvm)) { + ret = vgic_v4_init(dev->kvm); + if (ret < 0) { + mutex_unlock(&dev->kvm->arch.config_lock); + kfree(its); + return ret; + } + } + + mutex_init(&its->its_lock); + mutex_init(&its->cmd_lock); + + /* Yep, even more trickery for lock ordering... */ +#ifdef CONFIG_LOCKDEP + mutex_lock(&its->cmd_lock); + mutex_lock(&its->its_lock); + mutex_unlock(&its->its_lock); + mutex_unlock(&its->cmd_lock); +#endif + + its->vgic_its_base = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&its->device_list); + INIT_LIST_HEAD(&its->collection_list); + xa_init(&its->translation_cache); + + dev->kvm->arch.vgic.msis_require_devid = true; + dev->kvm->arch.vgic.has_its = true; + its->enabled = false; + its->dev = dev; + + its->baser_device_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); + its->baser_coll_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT); + dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; + + dev->private = its; + + ret = vgic_its_set_abi(its, NR_ITS_ABIS - 1); + + mutex_unlock(&dev->kvm->arch.config_lock); + + return ret; +} + +static void vgic_its_destroy(struct kvm_device *kvm_dev) +{ + struct kvm *kvm = kvm_dev->kvm; + struct vgic_its *its = kvm_dev->private; + + mutex_lock(&its->its_lock); + + vgic_its_debug_destroy(kvm_dev); + + vgic_its_free_device_list(kvm, its); + vgic_its_free_collection_list(kvm, its); + vgic_its_invalidate_cache(its); + xa_destroy(&its->translation_cache); + + mutex_unlock(&its->its_lock); + kfree(its); + kfree(kvm_dev);/* alloc by kvm_ioctl_create_device, free by .destroy */ +} + +static int vgic_its_has_attr_regs(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + gpa_t offset = attr->attr; + int align; + + align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 
0x3 : 0x7; + + if (offset & align) + return -EINVAL; + + region = vgic_find_mmio_region(its_registers, + ARRAY_SIZE(its_registers), + offset); + if (!region) + return -ENXIO; + + return 0; +} + +static int vgic_its_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u64 *reg, bool is_write) +{ + const struct vgic_register_region *region; + struct vgic_its *its; + gpa_t addr, offset; + unsigned int len; + int align, ret = 0; + + its = dev->private; + offset = attr->attr; + + /* + * Although the spec supports upper/lower 32-bit accesses to + * 64-bit ITS registers, the userspace ABI requires 64-bit + * accesses to all 64-bit wide registers. We therefore only + * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID + * registers + */ + if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4)) + align = 0x3; + else + align = 0x7; + + if (offset & align) + return -EINVAL; + + mutex_lock(&dev->kvm->lock); + + if (kvm_trylock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + + mutex_lock(&dev->kvm->arch.config_lock); + + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { + ret = -ENXIO; + goto out; + } + + region = vgic_find_mmio_region(its_registers, + ARRAY_SIZE(its_registers), + offset); + if (!region) { + ret = -ENXIO; + goto out; + } + + addr = its->vgic_its_base + offset; + + len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4; + + if (is_write) { + if (region->uaccess_its_write) + ret = region->uaccess_its_write(dev->kvm, its, addr, + len, *reg); + else + region->its_write(dev->kvm, its, addr, len, *reg); + } else { + *reg = region->its_read(dev->kvm, its, addr, len); + } +out: + mutex_unlock(&dev->kvm->arch.config_lock); + kvm_unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return ret; +} + +static u32 compute_next_devid_offset(struct list_head *h, + struct its_device *dev) +{ + struct its_device *next; + u32 next_offset; + + if (list_is_last(&dev->dev_list, h)) + return 0; + next = list_next_entry(dev, dev_list); + next_offset = next->device_id - dev->device_id; + + return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET); +} + +static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite) +{ + struct its_ite *next; + u32 next_offset; + + if (list_is_last(&ite->ite_list, h)) + return 0; + next = list_next_entry(ite, ite_list); + next_offset = next->event_id - ite->event_id; + + return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET); +} + +/** + * typedef entry_fn_t - Callback called on a table entry restore path + * @its: its handle + * @id: id of the entry + * @entry: pointer to the entry + * @opaque: pointer to an opaque data + * + * Return: < 0 on error, 0 if last element was identified, id offset to next + * element otherwise + */ +typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, + void *opaque); + +/** + * scan_its_table - Scan a contiguous table in guest RAM and applies a function + * to each entry + * + * @its: its handle + * @base: base gpa of the table + * @size: size of the table in bytes + * @esz: entry size in bytes + * @start_id: the ID of the first entry in the table + * (non zero for 2d level tables) + * @fn: function to apply on each entry + * @opaque: pointer to opaque data + * + * Return: < 0 on error, 0 if last element was identified, 1 otherwise + * (the last element may not be found on second level tables) + */ +static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, + int start_id, entry_fn_t fn, void *opaque) +{ + struct kvm *kvm = 
its->dev->kvm; + unsigned long len = size; + int id = start_id; + gpa_t gpa = base; + char entry[ESZ_MAX]; + int ret; + + memset(entry, 0, esz); + + while (true) { + int next_offset; + size_t byte_offset; + + ret = kvm_read_guest_lock(kvm, gpa, entry, esz); + if (ret) + return ret; + + next_offset = fn(its, id, entry, opaque); + if (next_offset <= 0) + return next_offset; + + byte_offset = next_offset * esz; + if (byte_offset >= len) + break; + + id += next_offset; + gpa += byte_offset; + len -= byte_offset; + } + return 1; +} + +/* + * vgic_its_save_ite - Save an interrupt translation entry at @gpa + */ +static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, + struct its_ite *ite, gpa_t gpa) +{ + u32 next_offset; + u64 val; + + next_offset = compute_next_eventid_offset(&dev->itt_head, ite); + val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) | + ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | + ite->collection->collection_id; + val = cpu_to_le64(val); + + return vgic_its_write_entry_lock(its, gpa, val, ite); +} + +/** + * vgic_its_restore_ite - restore an interrupt translation entry + * + * @its: its handle + * @event_id: id used for indexing + * @ptr: pointer to the ITE entry + * @opaque: pointer to the its_device + */ +static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, + void *ptr, void *opaque) +{ + struct its_device *dev = opaque; + struct its_collection *collection; + struct kvm *kvm = its->dev->kvm; + struct kvm_vcpu *vcpu = NULL; + u64 val; + u64 *p = (u64 *)ptr; + struct vgic_irq *irq; + u32 coll_id, lpi_id; + struct its_ite *ite; + u32 offset; + + val = *p; + + val = le64_to_cpu(val); + + coll_id = val & KVM_ITS_ITE_ICID_MASK; + lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT; + + if (!lpi_id) + return 1; /* invalid entry, no choice but to scan next entry */ + + if (lpi_id < VGIC_MIN_LPI) + return -EINVAL; + + offset = val >> KVM_ITS_ITE_NEXT_SHIFT; + if (event_id + offset >= BIT_ULL(dev->num_eventid_bits)) + return -EINVAL; + + collection = find_collection(its, coll_id); + if (!collection) + return -EINVAL; + + if (!vgic_its_check_event_id(its, dev, event_id)) + return -EINVAL; + + ite = vgic_its_alloc_ite(dev, collection, event_id); + if (IS_ERR(ite)) + return PTR_ERR(ite); + + if (its_is_collection_mapped(collection)) + vcpu = kvm_get_vcpu_by_id(kvm, collection->target_addr); + + irq = vgic_add_lpi(kvm, lpi_id, vcpu); + if (IS_ERR(irq)) { + its_free_ite(kvm, ite); + return PTR_ERR(irq); + } + ite->irq = irq; + + return offset; +} + +static int vgic_its_ite_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct its_ite *itea = container_of(a, struct its_ite, ite_list); + struct its_ite *iteb = container_of(b, struct its_ite, ite_list); + + if (itea->event_id < iteb->event_id) + return -1; + else + return 1; +} + +static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + gpa_t base = device->itt_addr; + struct its_ite *ite; + int ret; + int ite_esz = abi->ite_esz; + + list_sort(NULL, &device->itt_head, vgic_its_ite_cmp); + + list_for_each_entry(ite, &device->itt_head, ite_list) { + gpa_t gpa = base + ite->event_id * ite_esz; + + /* + * If an LPI carries the HW bit, this means that this + * interrupt is controlled by GICv4, and we do not + * have direct access to that state without GICv4.1. + * Let's simply fail the save operation... 
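+ * -EACCES is propagated back so userspace can tell that the ITT was
+ * not saved.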
+ */ + if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1) + return -EACCES; + + ret = vgic_its_save_ite(its, device, ite, gpa); + if (ret) + return ret; + } + return 0; +} + +/** + * vgic_its_restore_itt - restore the ITT of a device + * + * @its: its handle + * @dev: device handle + * + * Return 0 on success, < 0 on error + */ +static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + gpa_t base = dev->itt_addr; + int ret; + int ite_esz = abi->ite_esz; + size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz; + + ret = scan_its_table(its, base, max_size, ite_esz, 0, + vgic_its_restore_ite, dev); + + /* scan_its_table returns +1 if all ITEs are invalid */ + if (ret > 0) + ret = 0; + + return ret; +} + +/** + * vgic_its_save_dte - Save a device table entry at a given GPA + * + * @its: ITS handle + * @dev: ITS device + * @ptr: GPA + */ +static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, + gpa_t ptr) +{ + u64 val, itt_addr_field; + u32 next_offset; + + itt_addr_field = dev->itt_addr >> 8; + next_offset = compute_next_devid_offset(&its->device_list, dev); + val = (1ULL << KVM_ITS_DTE_VALID_SHIFT | + ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) | + (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | + (dev->num_eventid_bits - 1)); + val = cpu_to_le64(val); + + return vgic_its_write_entry_lock(its, ptr, val, dte); +} + +/** + * vgic_its_restore_dte - restore a device table entry + * + * @its: its handle + * @id: device id the DTE corresponds to + * @ptr: kernel VA where the 8 byte DTE is located + * @opaque: unused + * + * Return: < 0 on error, 0 if the dte is the last one, id offset to the + * next dte otherwise + */ +static int vgic_its_restore_dte(struct vgic_its *its, u32 id, + void *ptr, void *opaque) +{ + struct its_device *dev; + u64 baser = its->baser_device_table; + gpa_t itt_addr; + u8 num_eventid_bits; + u64 entry = *(u64 *)ptr; + bool valid; + u32 offset; + int ret; + + entry = le64_to_cpu(entry); + + valid = entry >> KVM_ITS_DTE_VALID_SHIFT; + num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1; + itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK) + >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8; + + if (!valid) + return 1; + + /* dte entry is valid */ + offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT; + + if (!vgic_its_check_id(its, baser, id, NULL)) + return -EINVAL; + + dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + ret = vgic_its_restore_itt(its, dev); + if (ret) { + vgic_its_free_device(its->dev->kvm, its, dev); + return ret; + } + + return offset; +} + +static int vgic_its_device_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct its_device *deva = container_of(a, struct its_device, dev_list); + struct its_device *devb = container_of(b, struct its_device, dev_list); + + if (deva->device_id < devb->device_id) + return -1; + else + return 1; +} + +/* + * vgic_its_save_device_tables - Save the device table and all ITT + * into guest RAM + * + * L1/L2 handling is hidden by vgic_its_check_id() helper which directly + * returns the GPA of the device entry + */ +static int vgic_its_save_device_tables(struct vgic_its *its) +{ + u64 baser = its->baser_device_table; + struct its_device *dev; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + list_sort(NULL, &its->device_list, vgic_its_device_cmp); + + list_for_each_entry(dev, &its->device_list, dev_list) { + int ret; + 
gpa_t eaddr; + + if (!vgic_its_check_id(its, baser, + dev->device_id, &eaddr)) + return -EINVAL; + + ret = vgic_its_save_itt(its, dev); + if (ret) + return ret; + + ret = vgic_its_save_dte(its, dev, eaddr); + if (ret) + return ret; + } + return 0; +} + +/** + * handle_l1_dte - callback used for L1 device table entries (2 stage case) + * + * @its: its handle + * @id: index of the entry in the L1 table + * @addr: kernel VA + * @opaque: unused + * + * L1 table entries are scanned by steps of 1 entry + * Return < 0 if error, 0 if last dte was found when scanning the L2 + * table, +1 otherwise (meaning next L1 entry must be scanned) + */ +static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, + void *opaque) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int l2_start_id = id * (SZ_64K / abi->dte_esz); + u64 entry = *(u64 *)addr; + int dte_esz = abi->dte_esz; + gpa_t gpa; + int ret; + + entry = le64_to_cpu(entry); + + if (!(entry & KVM_ITS_L1E_VALID_MASK)) + return 1; + + gpa = entry & KVM_ITS_L1E_ADDR_MASK; + + ret = scan_its_table(its, gpa, SZ_64K, dte_esz, + l2_start_id, vgic_its_restore_dte, NULL); + + return ret; +} + +/* + * vgic_its_restore_device_tables - Restore the device table and all ITT + * from guest RAM to internal data structs + */ +static int vgic_its_restore_device_tables(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_device_table; + int l1_esz, ret; + int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + gpa_t l1_gpa; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + l1_gpa = GITS_BASER_ADDR_48_to_52(baser); + + if (baser & GITS_BASER_INDIRECT) { + l1_esz = GITS_LVL1_ENTRY_SIZE; + ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, + handle_l1_dte, NULL); + } else { + l1_esz = abi->dte_esz; + ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, + vgic_its_restore_dte, NULL); + } + + /* scan_its_table returns +1 if all entries are invalid */ + if (ret > 0) + ret = 0; + + if (ret < 0) + vgic_its_free_device_list(its->dev->kvm, its); + + return ret; +} + +static int vgic_its_save_cte(struct vgic_its *its, + struct its_collection *collection, + gpa_t gpa) +{ + u64 val; + + val = (1ULL << KVM_ITS_CTE_VALID_SHIFT | + ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | + collection->collection_id); + val = cpu_to_le64(val); + + return vgic_its_write_entry_lock(its, gpa, val, cte); +} + +/* + * Restore a collection entry into the ITS collection table. + * Return +1 on success, 0 if the entry was invalid (which should be + * interpreted as end-of-table), and a negative error value for generic errors. 
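+ * The caller stops scanning the collection table as soon as anything
+ * other than +1 is returned.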
+ */ +static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa) +{ + struct its_collection *collection; + struct kvm *kvm = its->dev->kvm; + u32 target_addr, coll_id; + u64 val; + int ret; + + ret = vgic_its_read_entry_lock(its, gpa, &val, cte); + if (ret) + return ret; + val = le64_to_cpu(val); + if (!(val & KVM_ITS_CTE_VALID_MASK)) + return 0; + + target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT); + coll_id = val & KVM_ITS_CTE_ICID_MASK; + + if (target_addr != COLLECTION_NOT_MAPPED && + !kvm_get_vcpu_by_id(kvm, target_addr)) + return -EINVAL; + + collection = find_collection(its, coll_id); + if (collection) + return -EEXIST; + + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) + return -EINVAL; + + ret = vgic_its_alloc_collection(its, &collection, coll_id); + if (ret) + return ret; + collection->target_addr = target_addr; + return 1; +} + +/* + * vgic_its_save_collection_table - Save the collection table into + * guest RAM + */ +static int vgic_its_save_collection_table(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_coll_table; + gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); + struct its_collection *collection; + size_t max_size, filled = 0; + int ret, cte_esz = abi->cte_esz; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + + list_for_each_entry(collection, &its->collection_list, coll_list) { + ret = vgic_its_save_cte(its, collection, gpa); + if (ret) + return ret; + gpa += cte_esz; + filled += cte_esz; + } + + if (filled == max_size) + return 0; + + /* + * table is not fully filled, add a last dummy element + * with valid bit unset + */ + return vgic_its_write_entry_lock(its, gpa, 0ULL, cte); +} + +/* + * vgic_its_restore_collection_table - reads the collection table + * in guest memory and restores the ITS internal state. Requires the + * BASER registers to be restored before. 
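+ * Both the base address and the size of the table are decoded from the
+ * collection table BASER below.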
+ */ +static int vgic_its_restore_collection_table(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_coll_table; + int cte_esz = abi->cte_esz; + size_t max_size, read = 0; + gpa_t gpa; + int ret; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + gpa = GITS_BASER_ADDR_48_to_52(baser); + + max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + + while (read < max_size) { + ret = vgic_its_restore_cte(its, gpa); + if (ret <= 0) + break; + gpa += cte_esz; + read += cte_esz; + } + + if (ret > 0) + return 0; + + if (ret < 0) + vgic_its_free_collection_list(its->dev->kvm, its); + + return ret; +} + +/* + * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM + * according to v0 ABI + */ +static int vgic_its_save_tables_v0(struct vgic_its *its) +{ + int ret; + + ret = vgic_its_save_device_tables(its); + if (ret) + return ret; + + return vgic_its_save_collection_table(its); +} + +/* + * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM + * to internal data structs according to V0 ABI + * + */ +static int vgic_its_restore_tables_v0(struct vgic_its *its) +{ + int ret; + + ret = vgic_its_restore_collection_table(its); + if (ret) + return ret; + + ret = vgic_its_restore_device_tables(its); + if (ret) + vgic_its_free_collection_list(its->dev->kvm, its); + return ret; +} + +static int vgic_its_commit_v0(struct vgic_its *its) +{ + const struct vgic_its_abi *abi; + + abi = vgic_its_get_abi(its); + its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + + its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + + its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + return 0; +} + +static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its) +{ + /* We need to keep the ABI specific field values */ + its->baser_coll_table &= ~GITS_BASER_VALID; + its->baser_device_table &= ~GITS_BASER_VALID; + its->cbaser = 0; + its->creadr = 0; + its->cwriter = 0; + its->enabled = 0; + vgic_its_free_device_list(kvm, its); + vgic_its_free_collection_list(kvm, its); +} + +static int vgic_its_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_ITS_ADDR_TYPE: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + case KVM_DEV_ARM_ITS_CTRL_RESET: + return 0; + case KVM_DEV_ARM_ITS_SAVE_TABLES: + return 0; + case KVM_DEV_ARM_ITS_RESTORE_TABLES: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: + return vgic_its_has_attr_regs(dev, attr); + } + return -ENXIO; +} + +static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int ret = 0; + + if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ + return 0; + + mutex_lock(&kvm->lock); + + if (kvm_trylock_all_vcpus(kvm)) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + + mutex_lock(&kvm->arch.config_lock); + mutex_lock(&its->its_lock); + + switch (attr) { + case KVM_DEV_ARM_ITS_CTRL_RESET: + vgic_its_reset(kvm, its); + break; + case KVM_DEV_ARM_ITS_SAVE_TABLES: + ret = abi->save_tables(its); + break; + case KVM_DEV_ARM_ITS_RESTORE_TABLES: + ret = abi->restore_tables(its); + break; + default: + ret = -ENXIO; + break; + } + + mutex_unlock(&its->its_lock); + 
mutex_unlock(&kvm->arch.config_lock); + kvm_unlock_all_vcpus(kvm); + mutex_unlock(&kvm->lock); + return ret; +} + +/* + * kvm_arch_allow_write_without_running_vcpu - allow writing guest memory + * without the running VCPU when dirty ring is enabled. + * + * The running VCPU is required to track dirty guest pages when dirty ring + * is enabled. Otherwise, the backup bitmap should be used to track the + * dirty guest pages. When vgic/its tables are being saved, the backup + * bitmap is used to track the dirty guest pages due to the missed running + * VCPU in the period. + */ +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + return dist->table_write_in_progress; +} + +static int vgic_its_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct vgic_its *its = dev->private; + int ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + u64 addr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + ret = vgic_check_iorange(dev->kvm, its->vgic_its_base, + addr, SZ_64K, KVM_VGIC_V3_ITS_SIZE); + if (ret) + return ret; + + ret = vgic_register_its_iodev(dev->kvm, its, addr); + if (ret) + return ret; + + return vgic_its_debug_init(dev); + + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + return vgic_its_ctrl(dev->kvm, its, attr->attr); + case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_its_attr_regs_access(dev, attr, ®, true); + } + } + return -ENXIO; +} + +static int vgic_its_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + struct vgic_its *its = dev->private; + u64 addr = its->vgic_its_base; + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + } + case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + int ret; + + ret = vgic_its_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + return put_user(reg, uaddr); + } + default: + return -ENXIO; + } + + return 0; +} + +static struct kvm_device_ops kvm_arm_vgic_its_ops = { + .name = "kvm-arm-vgic-its", + .create = vgic_its_create, + .destroy = vgic_its_destroy, + .set_attr = vgic_its_set_attr, + .get_attr = vgic_its_get_attr, + .has_attr = vgic_its_has_attr, +}; + +int kvm_vgic_register_its_device(void) +{ + return kvm_register_device_ops(&kvm_arm_vgic_its_ops, + KVM_DEV_TYPE_ARM_VGIC_ITS); +} diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c new file mode 100644 index 000000000000..3d1a776b716d --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGIC: KVM DEVICE API + * + * Copyright (C) 2015 ARM Ltd. 
+ * Author: Marc Zyngier <marc.zyngier@arm.com> + */ +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <linux/uaccess.h> +#include <asm/kvm_mmu.h> +#include <asm/cputype.h> +#include "vgic.h" + +/* common helpers */ + +int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, + phys_addr_t addr, phys_addr_t alignment, + phys_addr_t size) +{ + if (!IS_VGIC_ADDR_UNDEF(ioaddr)) + return -EEXIST; + + if (!IS_ALIGNED(addr, alignment) || !IS_ALIGNED(size, alignment)) + return -EINVAL; + + if (addr + size < addr) + return -EINVAL; + + if (addr & ~kvm_phys_mask(&kvm->arch.mmu) || + (addr + size) > kvm_phys_size(&kvm->arch.mmu)) + return -E2BIG; + + return 0; +} + +static int vgic_check_type(struct kvm *kvm, int type_needed) +{ + if (kvm->arch.vgic.vgic_model != type_needed) + return -ENODEV; + else + return 0; +} + +int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) +{ + struct vgic_dist *vgic = &kvm->arch.vgic; + int r; + + mutex_lock(&kvm->arch.config_lock); + switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + if (!r) + r = vgic_check_iorange(kvm, vgic->vgic_dist_base, dev_addr->addr, + SZ_4K, KVM_VGIC_V2_DIST_SIZE); + if (!r) + vgic->vgic_dist_base = dev_addr->addr; + break; + case KVM_VGIC_V2_ADDR_TYPE_CPU: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + if (!r) + r = vgic_check_iorange(kvm, vgic->vgic_cpu_base, dev_addr->addr, + SZ_4K, KVM_VGIC_V2_CPU_SIZE); + if (!r) + vgic->vgic_cpu_base = dev_addr->addr; + break; + default: + r = -ENODEV; + } + + mutex_unlock(&kvm->arch.config_lock); + + return r; +} + +/** + * kvm_vgic_addr - set or get vgic VM base addresses + * @kvm: pointer to the vm struct + * @attr: pointer to the attribute being retrieved/updated + * @write: if true set the address in the VM address space, if false read the + * address + * + * Set or get the vgic base addresses for the distributor and the virtual CPU + * interface in the VM physical address space. These addresses are properties + * of the emulated core/SoC and therefore user space initially knows this + * information. + * Check them for sanity (alignment, double assignment). We can't check for + * overlapping regions in case of a virtual GICv3 here, since we don't know + * the number of VCPUs yet, so we defer this check to map_resources(). + */ +static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool write) +{ + u64 __user *uaddr = (u64 __user *)attr->addr; + struct vgic_dist *vgic = &kvm->arch.vgic; + phys_addr_t *addr_ptr, alignment, size; + u64 undef_value = VGIC_ADDR_UNDEF; + u64 addr; + int r; + + /* Reading a redistributor region addr implies getting the index */ + if (write || attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION) + if (get_user(addr, uaddr)) + return -EFAULT; + + /* + * Since we can't hold config_lock while registering the redistributor + * iodevs, take the slots_lock immediately. 
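+ * config_lock is still taken further down, but only nested inside
+ * slots_lock.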
+ */ + mutex_lock(&kvm->slots_lock); + switch (attr->attr) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + addr_ptr = &vgic->vgic_dist_base; + alignment = SZ_4K; + size = KVM_VGIC_V2_DIST_SIZE; + break; + case KVM_VGIC_V2_ADDR_TYPE_CPU: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + addr_ptr = &vgic->vgic_cpu_base; + alignment = SZ_4K; + size = KVM_VGIC_V2_CPU_SIZE; + break; + case KVM_VGIC_V3_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); + addr_ptr = &vgic->vgic_dist_base; + alignment = SZ_64K; + size = KVM_VGIC_V3_DIST_SIZE; + break; + case KVM_VGIC_V3_ADDR_TYPE_REDIST: { + struct vgic_redist_region *rdreg; + + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (r) + break; + if (write) { + r = vgic_v3_set_redist_base(kvm, 0, addr, 0); + goto out; + } + rdreg = list_first_entry_or_null(&vgic->rd_regions, + struct vgic_redist_region, list); + if (!rdreg) + addr_ptr = &undef_value; + else + addr_ptr = &rdreg->base; + break; + } + case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: + { + struct vgic_redist_region *rdreg; + u8 index; + + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (r) + break; + + index = addr & KVM_VGIC_V3_RDIST_INDEX_MASK; + + if (write) { + gpa_t base = addr & KVM_VGIC_V3_RDIST_BASE_MASK; + u32 count = FIELD_GET(KVM_VGIC_V3_RDIST_COUNT_MASK, addr); + u8 flags = FIELD_GET(KVM_VGIC_V3_RDIST_FLAGS_MASK, addr); + + if (!count || flags) + r = -EINVAL; + else + r = vgic_v3_set_redist_base(kvm, index, + base, count); + goto out; + } + + rdreg = vgic_v3_rdist_region_from_index(kvm, index); + if (!rdreg) { + r = -ENOENT; + goto out; + } + + addr = index; + addr |= rdreg->base; + addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; + goto out; + } + default: + r = -ENODEV; + } + + if (r) + goto out; + + mutex_lock(&kvm->arch.config_lock); + if (write) { + r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size); + if (!r) + *addr_ptr = addr; + } else { + addr = *addr_ptr; + } + mutex_unlock(&kvm->arch.config_lock); + +out: + mutex_unlock(&kvm->slots_lock); + + if (!r && !write) + r = put_user(addr, uaddr); + + return r; +} + +static int vgic_set_common_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int r; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + r = kvm_vgic_addr(dev->kvm, attr, true); + return (r == -ENODEV) ? -ENXIO : r; + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 val; + int ret = 0; + + if (get_user(val, uaddr)) + return -EFAULT; + + /* + * We require: + * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs + * - at most 1024 interrupts + * - a multiple of 32 interrupts + */ + if (val < (VGIC_NR_PRIVATE_IRQS + 32) || + val > VGIC_MAX_RESERVED || + (val & 31)) + return -EINVAL; + + mutex_lock(&dev->kvm->arch.config_lock); + + /* + * Either userspace has already configured NR_IRQS or + * the vgic has already been initialized and vgic_init() + * supplied a default amount of SPIs. 
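+ * In both cases nr_spis is already non-zero and the write is rejected
+ * with -EBUSY.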
+ */ + if (dev->kvm->arch.vgic.nr_spis) + ret = -EBUSY; + else + dev->kvm->arch.vgic.nr_spis = + val - VGIC_NR_PRIVATE_IRQS; + + mutex_unlock(&dev->kvm->arch.config_lock); + + return ret; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: { + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + mutex_lock(&dev->kvm->arch.config_lock); + r = vgic_init(dev->kvm); + mutex_unlock(&dev->kvm->arch.config_lock); + return r; + case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: + /* + * OK, this one isn't common at all, but we + * want to handle all control group attributes + * in a single place. + */ + if (vgic_check_type(dev->kvm, KVM_DEV_TYPE_ARM_VGIC_V3)) + return -ENXIO; + mutex_lock(&dev->kvm->lock); + + if (kvm_trylock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + + mutex_lock(&dev->kvm->arch.config_lock); + r = vgic_v3_save_pending_tables(dev->kvm); + mutex_unlock(&dev->kvm->arch.config_lock); + kvm_unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return r; + } + break; + } + } + + return -ENXIO; +} + +static int vgic_get_common_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int r = -ENXIO; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + r = kvm_vgic_addr(dev->kvm, attr, false); + return (r == -ENODEV) ? -ENXIO : r; + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + + r = put_user(dev->kvm->arch.vgic.nr_spis + + VGIC_NR_PRIVATE_IRQS, uaddr); + break; + } + } + + return r; +} + +static int vgic_create(struct kvm_device *dev, u32 type) +{ + return kvm_vgic_create(dev->kvm, type); +} + +static void vgic_destroy(struct kvm_device *dev) +{ + kfree(dev); +} + +int kvm_register_vgic_device(unsigned long type) +{ + int ret = -ENODEV; + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V2: + ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, + KVM_DEV_TYPE_ARM_VGIC_V2); + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: + ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, + KVM_DEV_TYPE_ARM_VGIC_V3); + + if (ret) + break; + ret = kvm_vgic_register_its_device(); + break; + } + + return ret; +} + +int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr) +{ + int cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr); + + reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + reg_attr->vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid); + if (!reg_attr->vcpu) + return -EINVAL; + + return 0; +} + +/** + * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state + * + * @dev: kvm device handle + * @attr: kvm device attribute + * @is_write: true if userspace is writing a register + */ +static int vgic_v2_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + bool is_write) +{ + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; + struct vgic_reg_attr reg_attr; + gpa_t addr; + struct kvm_vcpu *vcpu; + int ret; + u32 val; + + ret = vgic_v2_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + if (is_write) + if (get_user(val, uaddr)) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + + if (kvm_trylock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + + mutex_lock(&dev->kvm->arch.config_lock); + + ret = vgic_init(dev->kvm); + if (ret) + goto out; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val); + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + ret = 
vgic_v2_dist_uaccess(vcpu, is_write, addr, &val); + break; + default: + ret = -EINVAL; + break; + } + +out: + mutex_unlock(&dev->kvm->arch.config_lock); + kvm_unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->lock); + + if (!ret && !is_write) + ret = put_user(val, uaddr); + + return ret; +} + +static int vgic_v2_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_attr_regs_access(dev, attr, true); + default: + return vgic_set_common_attr(dev, attr); + } +} + +static int vgic_v2_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_attr_regs_access(dev, attr, false); + default: + return vgic_get_common_attr(dev, attr); + } +} + +static int vgic_v2_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + case KVM_VGIC_V2_ADDR_TYPE_CPU: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_has_attr_regs(dev, attr); + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + } + return -ENXIO; +} + +struct kvm_device_ops kvm_arm_vgic_v2_ops = { + .name = "kvm-arm-vgic-v2", + .create = vgic_create, + .destroy = vgic_destroy, + .set_attr = vgic_v2_set_attr, + .get_attr = vgic_v2_get_attr, + .has_attr = vgic_v2_has_attr, +}; + +int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr) +{ + unsigned long vgic_mpidr, mpidr_reg; + + /* + * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group, + * attr might not hold MPIDR. Hence assume vcpu0. + */ + if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) { + vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >> + KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT; + + mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr); + reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg); + } else { + reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0); + } + + if (!reg_attr->vcpu) + return -EINVAL; + + reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + + return 0; +} + +/* + * Allow access to certain ID-like registers prior to VGIC initialization, + * thereby allowing the VMM to provision the features / sizing of the VGIC. 
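+ * Only GICD_IIDR and GICD_TYPER2 in the distributor register group
+ * qualify.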
+ */ +static bool reg_allowed_pre_init(struct kvm_device_attr *attr) +{ + if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) + return false; + + switch (attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK) { + case GICD_IIDR: + case GICD_TYPER2: + return true; + default: + return false; + } +} + +/* + * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state + * + * @dev: kvm device handle + * @attr: kvm device attribute + * @is_write: true if userspace is writing a register + */ +static int vgic_v3_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + bool is_write) +{ + struct vgic_reg_attr reg_attr; + gpa_t addr; + struct kvm_vcpu *vcpu; + bool uaccess; + u32 val; + int ret; + + ret = vgic_v3_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + /* Sysregs uaccess is performed by the sysreg handling code */ + uaccess = false; + break; + default: + uaccess = true; + } + + if (uaccess && is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; + if (get_user(val, uaddr)) + return -EFAULT; + } + + mutex_lock(&dev->kvm->lock); + + if (kvm_trylock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + + mutex_lock(&dev->kvm->arch.config_lock); + + if (!(vgic_initialized(dev->kvm) || reg_allowed_pre_init(attr))) { + ret = -EBUSY; + goto out; + } + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &val); + break; + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &val); + break; + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + ret = vgic_v3_cpu_sysregs_uaccess(vcpu, attr, is_write); + break; + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + unsigned int info, intid; + + info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> + KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; + if (info == VGIC_LEVEL_INFO_LINE_LEVEL) { + intid = attr->attr & + KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; + ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, + intid, &val); + } else { + ret = -EINVAL; + } + break; + } + default: + ret = -EINVAL; + break; + } + +out: + mutex_unlock(&dev->kvm->arch.config_lock); + kvm_unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->lock); + + if (!ret && uaccess && !is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; + ret = put_user(val, uaddr); + } + + return ret; +} + +static int vgic_v3_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + return vgic_v3_attr_regs_access(dev, attr, true); + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { + u32 __user *uaddr = (u32 __user *)attr->addr; + u32 val; + + if (get_user(val, uaddr)) + return -EFAULT; + + guard(mutex)(&dev->kvm->arch.config_lock); + if (vgic_initialized(dev->kvm)) + return -EBUSY; + + if (!irq_is_ppi(val)) + return -EINVAL; + + dev->kvm->arch.vgic.mi_intid = val; + return 0; + } + default: + return vgic_set_common_attr(dev, attr); + } +} + +static int vgic_v3_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + return 
vgic_v3_attr_regs_access(dev, attr, false); + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + + guard(mutex)(&dev->kvm->arch.config_lock); + return put_user(dev->kvm->arch.vgic.mi_intid, uaddr); + } + default: + return vgic_get_common_attr(dev, attr); + } +} + +static int vgic_v3_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V3_ADDR_TYPE_DIST: + case KVM_VGIC_V3_ADDR_TYPE_REDIST: + case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + return vgic_v3_has_attr_regs(dev, attr); + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: + return 0; + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> + KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) == + VGIC_LEVEL_INFO_LINE_LEVEL) + return 0; + break; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: + return 0; + } + } + return -ENXIO; +} + +struct kvm_device_ops kvm_arm_vgic_v3_ops = { + .name = "kvm-arm-vgic-v3", + .create = vgic_create, + .destroy = vgic_destroy, + .set_attr = vgic_v3_set_attr, + .get_attr = vgic_v3_get_attr, + .has_attr = vgic_v3_has_attr, +}; diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c new file mode 100644 index 000000000000..406845b3117c --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -0,0 +1,585 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGICv2 MMIO handling functions + */ + +#include <linux/irqchip/arm-gic.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/nospec.h> + +#include <kvm/iodev.h> +#include <kvm/arm_vgic.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +/* + * The Revision field in the IIDR have the following meanings: + * + * Revision 1: Report GICv2 interrupts as group 0 instead of group 1 + * Revision 2: Interrupt groups are guest-configurable and signaled using + * their configured groups. + */ + +static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + u32 value; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + value = vgic->enabled ? 
GICD_ENABLE : 0; + break; + case GIC_DIST_CTR: + value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; + value = (value >> 5) - 1; + value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; + break; + case GIC_DIST_IIDR: + value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | + (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | + (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); + break; + default: + return 0; + } + + return value; +} + +static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + bool was_enabled = dist->enabled; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + dist->enabled = val & GICD_ENABLE; + if (!was_enabled && dist->enabled) + vgic_kick_vcpus(vcpu->kvm); + break; + case GIC_DIST_CTR: + case GIC_DIST_IIDR: + /* Nothing to do */ + return; + } +} + +static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u32 reg; + + switch (addr & 0x0c) { + case GIC_DIST_IIDR: + reg = vgic_mmio_read_v2_misc(vcpu, addr, len); + if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK) + return -EINVAL; + + /* + * If we observe a write to GICD_IIDR we know that userspace + * has been updated and has had a chance to cope with older + * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting + * interrupts as group 1, and therefore we now allow groups to + * be user writable. Doing this by default would break + * migration from old kernels to new kernels with legacy + * userspace. + */ + reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg); + switch (reg) { + case KVM_VGIC_IMP_REV_2: + case KVM_VGIC_IMP_REV_3: + vcpu->kvm->arch.vgic.v2_groups_user_writable = true; + dist->implementation_rev = reg; + return 0; + default: + return -EINVAL; + } + } + + vgic_mmio_write_v2_misc(vcpu, addr, len, val); + return 0; +} + +static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + if (vcpu->kvm->arch.vgic.v2_groups_user_writable) + vgic_mmio_write_group(vcpu, addr, len, val); + + return 0; +} + +static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus); + int intid = val & 0xf; + int targets = (val >> 16) & 0xff; + int mode = (val >> 24) & 0x03; + struct kvm_vcpu *vcpu; + unsigned long flags, c; + + switch (mode) { + case 0x0: /* as specified by targets */ + break; + case 0x1: + targets = (1U << nr_vcpus) - 1; /* all, ... 
*/ + targets &= ~(1U << source_vcpu->vcpu_id); /* but self */ + break; + case 0x2: /* this very vCPU only */ + targets = (1U << source_vcpu->vcpu_id); + break; + case 0x3: /* reserved */ + return; + } + + kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) { + struct vgic_irq *irq; + + if (!(targets & (1U << c))) + continue; + + irq = vgic_get_vcpu_irq(vcpu, intid); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + irq->source |= 1U << source_vcpu->vcpu_id; + + vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags); + vgic_put_irq(source_vcpu->kvm, irq); + } +} + +static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + val |= (u64)irq->targets << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); + } + + return val; +} + +static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0); + int i; + unsigned long flags; + + /* GICD_ITARGETSR[0-7] are read-only */ + if (intid < VGIC_NR_PRIVATE_IRQS) + return; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid + i); + int target; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + irq->targets = (val >> (i * 8)) & cpu_mask; + target = irq->targets ? __ffs(irq->targets) : 0; + irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = addr & 0x0f; + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + val |= (u64)irq->source << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); + } + return val; +} + +static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = addr & 0x0f; + int i; + unsigned long flags; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + irq->source &= ~((val >> (i * 8)) & 0xff); + if (!irq->source) + irq->pending_latch = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = addr & 0x0f; + int i; + unsigned long flags; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + irq->source |= (val >> (i * 8)) & 0xff; + + if (irq->source) { + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } else { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + vgic_put_irq(vcpu->kvm, irq); + } +} + +#define GICC_ARCH_VERSION_V2 0x2 + +/* These are for userland accesses only, there is no guest-facing emulation. 
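+ *
+ * They are reached through the vgic KVM device rather than by a
+ * guest trap. Roughly, and purely as an illustration (the fd and
+ * variable names below are made up; the attribute encoding follows
+ * Documentation/virt/kvm/devices/arm-vgic.rst):
+ *
+ *   __u32 val;
+ *   struct kvm_device_attr attr = {
+ *       .group = KVM_DEV_ARM_VGIC_GRP_CPU_REGS,
+ *       .attr  = ((__u64)vcpu_index << KVM_DEV_ARM_VGIC_CPUID_SHIFT) |
+ *                (GIC_CPU_PRIMASK & KVM_DEV_ARM_VGIC_OFFSET_MASK),
+ *       .addr  = (__u64)(unsigned long)&val,
+ *   };
+ *   ioctl(vgic_v2_device_fd, KVM_GET_DEVICE_ATTR, &attr);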
*/ +static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_vmcr vmcr; + u32 val; + + vgic_get_vmcr(vcpu, &vmcr); + + switch (addr & 0xff) { + case GIC_CPU_CTRL: + val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT; + val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT; + val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT; + val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT; + val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT; + val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT; + + break; + case GIC_CPU_PRIMASK: + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long. + */ + val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >> + GICV_PMR_PRIORITY_SHIFT; + break; + case GIC_CPU_BINPOINT: + val = vmcr.bpr; + break; + case GIC_CPU_ALIAS_BINPOINT: + val = vmcr.abpr; + break; + case GIC_CPU_IDENT: + val = ((PRODUCT_ID_KVM << 20) | + (GICC_ARCH_VERSION_V2 << 16) | + IMPLEMENTER_ARM); + break; + default: + return 0; + } + + return val; +} + +static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + + switch (addr & 0xff) { + case GIC_CPU_CTRL: + vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0); + vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1); + vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl); + vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn); + vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR); + vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS); + + break; + case GIC_CPU_PRIMASK: + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long. + */ + vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) & + GICV_PMR_PRIORITY_MASK; + break; + case GIC_CPU_BINPOINT: + vmcr.bpr = val; + break; + case GIC_CPU_ALIAS_BINPOINT: + vmcr.abpr = val; + break; + } + + vgic_set_vmcr(vcpu, &vmcr); +} + +static void vgic_mmio_write_dir(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_deactivate(vcpu, val); + else + vgic_v3_deactivate(vcpu, val); +} + +static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + int n; /* which APRn is this */ + + n = (addr >> 2) & 0x3; + + if (kvm_vgic_global_state.type == VGIC_V2) { + /* GICv2 hardware systems support max. 32 groups */ + if (n != 0) + return 0; + return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr; + } else { + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (n > vgic_v3_max_apr_idx(vcpu)) + return 0; + + n = array_index_nospec(n, 4); + + /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ + return vgicv3->vgic_ap1r[n]; + } +} + +static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int n; /* which APRn is this */ + + n = (addr >> 2) & 0x3; + + if (kvm_vgic_global_state.type == VGIC_V2) { + /* GICv2 hardware systems support max. 
32 groups */ + if (n != 0) + return; + vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val; + } else { + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (n > vgic_v3_max_apr_idx(vcpu)) + return; + + n = array_index_nospec(n, 4); + + /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ + vgicv3->vgic_ap1r[n] = val; + } +} + +static const struct vgic_register_region vgic_v2_dist_registers[] = { + REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL, + vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, + NULL, vgic_mmio_uaccess_write_v2_misc, + 12, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP, + vgic_mmio_read_group, vgic_mmio_write_group, + NULL, vgic_mmio_uaccess_write_v2_group, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET, + vgic_mmio_read_enable, vgic_mmio_write_senable, + NULL, vgic_uaccess_write_senable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR, + vgic_mmio_read_enable, vgic_mmio_write_cenable, + NULL, vgic_uaccess_write_cenable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, + vgic_mmio_read_pending, vgic_mmio_write_spending, + vgic_uaccess_read_pending, vgic_uaccess_write_spending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR, + vgic_mmio_read_pending, vgic_mmio_write_cpending, + vgic_uaccess_read_pending, vgic_uaccess_write_cpending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, + vgic_mmio_read_active, vgic_mmio_write_sactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR, + vgic_mmio_read_active, vgic_mmio_write_cactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI, + vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, + 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET, + vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG, + vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT, + vgic_mmio_read_raz, vgic_mmio_write_sgir, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR, + vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET, + vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), +}; + +static const struct vgic_register_region vgic_v2_cpu_registers[] = { + REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO, + vgic_mmio_read_apr, vgic_mmio_write_apr, 16, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + 
REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_CPU_DEACTIVATE, + vgic_mmio_read_raz, vgic_mmio_write_dir, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, + 4, VGIC_ACCESS_32bit), +}; + +unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v2_dist_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return SZ_4K; +} + +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v2_cpu_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return KVM_VGIC_V2_CPU_SIZE; +} + +int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + struct vgic_io_device iodev; + struct vgic_reg_attr reg_attr; + struct kvm_vcpu *vcpu; + gpa_t addr; + int ret; + + ret = vgic_v2_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + iodev.regions = vgic_v2_dist_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + iodev.base_addr = 0; + break; + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + iodev.regions = vgic_v2_cpu_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + iodev.base_addr = 0; + break; + default: + return -ENXIO; + } + + /* We only support aligned 32-bit accesses. */ + if (addr & 3) + return -ENXIO; + + region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); + if (!region) + return -ENXIO; + + return 0; +} + +int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v2_cpu_registers, + .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers), + .iodev_type = IODEV_CPUIF, + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} + +int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v2_dist_registers, + .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers), + .iodev_type = IODEV_DIST, + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c new file mode 100644 index 000000000000..70d50c77e5dc --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -0,0 +1,1172 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGICv3 MMIO handling functions + */ + +#include <linux/bitfield.h> +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <kvm/iodev.h> +#include <kvm/arm_vgic.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +/* extract @num bytes at @offset bytes offset in data */ +unsigned long extract_bytes(u64 data, unsigned int offset, + unsigned int num) +{ + return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); +} + +/* allows updates of any half of a 64-bit register (or the whole thing) */ +u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long val) +{ + int lower = (offset & 4) * 8; + int upper = lower + 8 * len - 1; + + reg &= ~GENMASK_ULL(upper, lower); + val &= GENMASK_ULL(len * 8 - 1, 0); + + return reg | ((u64)val << lower); +} + +bool vgic_has_its(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + if 
(dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return false; + + return dist->has_its; +} + +bool vgic_supports_direct_msis(struct kvm *kvm) +{ + /* + * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware, + * indirectly allowing userspace to control whether or not vPEs are + * allocated for the VM. + */ + if (system_supports_direct_sgis() && !vgic_supports_direct_sgis(kvm)) + return false; + + return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm); +} + +bool system_supports_direct_sgis(void) +{ + return kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi(); +} + +bool vgic_supports_direct_sgis(struct kvm *kvm) +{ + return kvm->arch.vgic.nassgicap; +} + +/* + * The Revision field in the IIDR have the following meanings: + * + * Revision 2: Interrupt groups are guest-configurable and signaled using + * their configured groups. + */ + +static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + u32 value = 0; + + switch (addr & 0x0c) { + case GICD_CTLR: + if (vgic->enabled) + value |= GICD_CTLR_ENABLE_SS_G1; + value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; + if (vgic->nassgireq) + value |= GICD_CTLR_nASSGIreq; + break; + case GICD_TYPER: + value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; + value = (value >> 5) - 1; + if (vgic_has_its(vcpu->kvm)) { + value |= (INTERRUPT_ID_BITS_ITS - 1) << 19; + value |= GICD_TYPER_LPIS; + } else { + value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; + } + break; + case GICD_TYPER2: + if (vgic_supports_direct_sgis(vcpu->kvm)) + value = GICD_TYPER2_nASSGIcap; + break; + case GICD_IIDR: + value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | + (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | + (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); + break; + default: + return 0; + } + + return value; +} + +static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + switch (addr & 0x0c) { + case GICD_CTLR: { + bool was_enabled, is_hwsgi; + + mutex_lock(&vcpu->kvm->arch.config_lock); + + was_enabled = dist->enabled; + is_hwsgi = dist->nassgireq; + + dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; + + /* Not a GICv4.1? No HW SGIs */ + if (!vgic_supports_direct_sgis(vcpu->kvm)) + val &= ~GICD_CTLR_nASSGIreq; + + /* Dist stays enabled? nASSGIreq is RO */ + if (was_enabled && dist->enabled) { + val &= ~GICD_CTLR_nASSGIreq; + val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi); + } + + /* Switching HW SGIs? */ + dist->nassgireq = val & GICD_CTLR_nASSGIreq; + if (is_hwsgi != dist->nassgireq) + vgic_v4_configure_vsgis(vcpu->kvm); + + if (vgic_supports_direct_sgis(vcpu->kvm) && + was_enabled != dist->enabled) + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4); + else if (!was_enabled && dist->enabled) + vgic_kick_vcpus(vcpu->kvm); + + mutex_unlock(&vcpu->kvm->arch.config_lock); + break; + } + case GICD_TYPER: + case GICD_TYPER2: + case GICD_IIDR: + /* This is at best for documentation purposes... 
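+ *
+ * These registers are read-only for the guest; only the GICD_CTLR
+ * case above has side effects. Note, for example, that while the
+ * distributor stays enabled a write that tries to flip
+ * GICD_CTLR.nASSGIreq is discarded there, so the HW/SW SGI delivery
+ * mode can only change while the distributor is disabled.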
*/ + return; + } +} + +static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u32 reg; + + switch (addr & 0x0c) { + case GICD_TYPER2: + reg = vgic_mmio_read_v3_misc(vcpu, addr, len); + + if (reg == val) + return 0; + if (vgic_initialized(vcpu->kvm)) + return -EBUSY; + if ((reg ^ val) & ~GICD_TYPER2_nASSGIcap) + return -EINVAL; + if (!system_supports_direct_sgis() && val) + return -EINVAL; + + dist->nassgicap = val & GICD_TYPER2_nASSGIcap; + return 0; + case GICD_IIDR: + reg = vgic_mmio_read_v3_misc(vcpu, addr, len); + if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK) + return -EINVAL; + + reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg); + switch (reg) { + case KVM_VGIC_IMP_REV_2: + case KVM_VGIC_IMP_REV_3: + dist->implementation_rev = reg; + return 0; + default: + return -EINVAL; + } + case GICD_CTLR: + /* Not a GICv4.1? No HW SGIs */ + if (!vgic_supports_direct_sgis(vcpu->kvm)) + val &= ~GICD_CTLR_nASSGIreq; + + dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; + dist->nassgireq = val & GICD_CTLR_nASSGIreq; + return 0; + } + + vgic_mmio_write_v3_misc(vcpu, addr, len, val); + return 0; +} + +static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + int intid = VGIC_ADDR_TO_INTID(addr, 64); + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid); + unsigned long ret = 0; + + if (!irq) + return 0; + + /* The upper word is RAZ for us. */ + if (!(addr & 4)) + ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); + + vgic_put_irq(vcpu->kvm, irq); + return ret; +} + +static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int intid = VGIC_ADDR_TO_INTID(addr, 64); + struct vgic_irq *irq; + unsigned long flags; + + /* The upper word is WI for us since we don't implement Aff3. */ + if (addr & 4) + return; + + irq = vgic_get_irq(vcpu->kvm, intid); + + if (!irq) + return; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* We only care about and preserve Aff0, Aff1 and Aff2. */ + irq->mpidr = val & GENMASK(23, 0); + irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); +} + +bool vgic_lpis_enabled(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + return atomic_read(&vgic_cpu->ctlr) == GICR_CTLR_ENABLE_LPIS; +} + +static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + unsigned long val; + + val = atomic_read(&vgic_cpu->ctlr); + if (vgic_get_implementation_rev(vcpu) >= KVM_VGIC_IMP_REV_3) + val |= GICR_CTLR_IR | GICR_CTLR_CES; + + return val; +} + +static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u32 ctlr; + + if (!vgic_has_its(vcpu->kvm)) + return; + + if (!(val & GICR_CTLR_ENABLE_LPIS)) { + /* + * Don't disable if RWP is set, as there already an + * ongoing disable. Funky guest... 
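+ *
+ * The cmpxchg below only succeeds when the current state is exactly
+ * GICR_CTLR_ENABLE_LPIS, moving it to GICR_CTLR_RWP (disable in
+ * progress); from any other starting state (already disabled, or a
+ * disable already in flight) this write is simply dropped.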
+ */ + ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, + GICR_CTLR_ENABLE_LPIS, + GICR_CTLR_RWP); + if (ctlr != GICR_CTLR_ENABLE_LPIS) + return; + + vgic_flush_pending_lpis(vcpu); + vgic_its_invalidate_all_caches(vcpu->kvm); + atomic_set_release(&vgic_cpu->ctlr, 0); + } else { + ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0, + GICR_CTLR_ENABLE_LPIS); + if (ctlr != 0) + return; + + vgic_enable_lpis(vcpu); + } +} + +static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu) +{ + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_redist_region *iter, *rdreg = vgic_cpu->rdreg; + + if (!rdreg) + return false; + + if (vgic_cpu->rdreg_index < rdreg->free_index - 1) { + return false; + } else if (rdreg->count && vgic_cpu->rdreg_index == (rdreg->count - 1)) { + struct list_head *rd_regions = &vgic->rd_regions; + gpa_t end = rdreg->base + rdreg->count * KVM_VGIC_V3_REDIST_SIZE; + + /* + * the rdist is the last one of the redist region, + * check whether there is no other contiguous rdist region + */ + list_for_each_entry(iter, rd_regions, list) { + if (iter->base == end && iter->free_index > 0) + return false; + } + } + return true; +} + +static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + int target_vcpu_id = vcpu->vcpu_id; + u64 value; + + value = (u64)(mpidr & GENMASK(23, 0)) << 32; + value |= ((target_vcpu_id & 0xffff) << 8); + + if (vgic_has_its(vcpu->kvm)) + value |= GICR_TYPER_PLPIS; + + if (vgic_mmio_vcpu_rdist_is_last(vcpu)) + value |= GICR_TYPER_LAST; + + return extract_bytes(value, addr & 7, len); +} + +static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); +} + +static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + switch (addr & 0xffff) { + case GICD_PIDR2: + /* report a GICv3 compliant implementation */ + return 0x3b; + } + + return 0; +} + +static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int ret; + + ret = vgic_uaccess_write_spending(vcpu, addr, len, val); + if (ret) + return ret; + + return vgic_uaccess_write_cpending(vcpu, addr, len, ~val); +} + +/* We want to avoid outer shareable. */ +u64 vgic_sanitise_shareability(u64 field) +{ + switch (field) { + case GIC_BASER_OuterShareable: + return GIC_BASER_InnerShareable; + default: + return field; + } +} + +/* Avoid any inner non-cacheable mapping. */ +u64 vgic_sanitise_inner_cacheability(u64 field) +{ + switch (field) { + case GIC_BASER_CACHE_nCnB: + case GIC_BASER_CACHE_nC: + return GIC_BASER_CACHE_RaWb; + default: + return field; + } +} + +/* Non-cacheable or same-as-inner are OK. 
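+ *
+ * For example, a guest that programs GICR_PROPBASER.OuterCache to a
+ * write-back attribute will read back SameAsInner, since everything
+ * other than nC and SameAsInner is collapsed to SameAsInner below.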
*/ +u64 vgic_sanitise_outer_cacheability(u64 field) +{ + switch (field) { + case GIC_BASER_CACHE_SameAsInner: + case GIC_BASER_CACHE_nC: + return field; + default: + return GIC_BASER_CACHE_SameAsInner; + } +} + +u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, + u64 (*sanitise_fn)(u64)) +{ + u64 field = (reg & field_mask) >> field_shift; + + field = sanitise_fn(field) << field_shift; + return (reg & ~field_mask) | field; +} + +#define PROPBASER_RES0_MASK \ + (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5)) +#define PENDBASER_RES0_MASK \ + (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \ + GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0)) + +static u64 vgic_sanitise_pendbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK, + GICR_PENDBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK, + GICR_PENDBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK, + GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + reg &= ~PENDBASER_RES0_MASK; + + return reg; +} + +static u64 vgic_sanitise_propbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK, + GICR_PROPBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK, + GICR_PROPBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK, + GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + reg &= ~PROPBASER_RES0_MASK; + return reg; +} + +static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return extract_bytes(dist->propbaser, addr & 7, len); +} + +static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u64 old_propbaser, propbaser; + + /* Storing a value with LPIs already enabled is undefined */ + if (vgic_lpis_enabled(vcpu)) + return; + + do { + old_propbaser = READ_ONCE(dist->propbaser); + propbaser = old_propbaser; + propbaser = update_64bit_reg(propbaser, addr & 4, len, val); + propbaser = vgic_sanitise_propbaser(propbaser); + } while (cmpxchg64(&dist->propbaser, old_propbaser, + propbaser) != old_propbaser); +} + +static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 value = vgic_cpu->pendbaser; + + value &= ~GICR_PENDBASER_PTZ; + + return extract_bytes(value, addr & 7, len); +} + +static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 old_pendbaser, pendbaser; + + /* Storing a value with LPIs already enabled is undefined */ + if (vgic_lpis_enabled(vcpu)) + return; + + do { + old_pendbaser = READ_ONCE(vgic_cpu->pendbaser); + pendbaser = old_pendbaser; + pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); + pendbaser = vgic_sanitise_pendbaser(pendbaser); + } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser, + pendbaser) != old_pendbaser); +} + +static unsigned long vgic_mmio_read_sync(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) 
+{ + return !!atomic_read(&vcpu->arch.vgic_cpu.syncr_busy); +} + +static void vgic_set_rdist_busy(struct kvm_vcpu *vcpu, bool busy) +{ + if (busy) { + atomic_inc(&vcpu->arch.vgic_cpu.syncr_busy); + smp_mb__after_atomic(); + } else { + smp_mb__before_atomic(); + atomic_dec(&vcpu->arch.vgic_cpu.syncr_busy); + } +} + +static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_irq *irq; + u32 intid; + + /* + * If the guest wrote only to the upper 32bit part of the + * register, drop the write on the floor, as it is only for + * vPEs (which we don't support for obvious reasons). + * + * Also discard the access if LPIs are not enabled. + */ + if ((addr & 4) || !vgic_lpis_enabled(vcpu)) + return; + + intid = lower_32_bits(val); + if (intid < VGIC_MIN_LPI) + return; + + vgic_set_rdist_busy(vcpu, true); + + irq = vgic_get_irq(vcpu->kvm, intid); + if (irq) { + vgic_its_inv_lpi(vcpu->kvm, irq); + vgic_put_irq(vcpu->kvm, irq); + } + + vgic_set_rdist_busy(vcpu, false); +} + +static void vgic_mmio_write_invall(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + /* See vgic_mmio_write_invlpi() for the early return rationale */ + if ((addr & 4) || !vgic_lpis_enabled(vcpu)) + return; + + vgic_set_rdist_busy(vcpu, true); + vgic_its_invall(vcpu); + vgic_set_rdist_busy(vcpu, false); +} + +/* + * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the + * redistributors, while SPIs are covered by registers in the distributor + * block. Trying to set private IRQs in this block gets ignored. + * We take some special care here to fix the calculation of the register + * offset. + */ +#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = bpi, \ + .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ + .access_flags = acc, \ + .read = vgic_mmio_read_raz, \ + .write = vgic_mmio_write_wi, \ + }, { \ + .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ + .bits_per_irq = bpi, \ + .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + .uaccess_read = ur, \ + .uaccess_write = uw, \ + } + +static const struct vgic_register_region vgic_v3_dist_registers[] = { + REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR, + vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, + NULL, vgic_mmio_uaccess_write_v3_misc, + 16, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICD_STATUSR, + vgic_mmio_read_rao, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, + vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, + vgic_mmio_read_enable, vgic_mmio_write_senable, + NULL, vgic_uaccess_write_senable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, + vgic_mmio_read_enable, vgic_mmio_write_cenable, + NULL, vgic_uaccess_write_cenable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, + vgic_mmio_read_pending, vgic_mmio_write_spending, + vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, + vgic_mmio_read_pending, vgic_mmio_write_cpending, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, + vgic_mmio_read_active, vgic_mmio_write_sactive, + vgic_uaccess_read_active, 
vgic_mmio_uaccess_write_sactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, + vgic_mmio_read_active, vgic_mmio_write_cactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, + 1, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, + vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, + 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR, + vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR, + vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR, + vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER, + vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICD_IDREGS, + vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, + VGIC_ACCESS_32bit), +}; + +static const struct vgic_register_region vgic_v3_rd_registers[] = { + /* RD_base registers */ + REGISTER_DESC_WITH_LENGTH(GICR_CTLR, + vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_STATUSR, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_IIDR, + vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER, + vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, + NULL, vgic_mmio_uaccess_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_WAKER, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, + vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, + vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_INVLPIR, + vgic_mmio_read_raz, vgic_mmio_write_invlpi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_INVALLR, + vgic_mmio_read_raz, vgic_mmio_write_invall, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_SYNCR, + vgic_mmio_read_sync, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, + vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, + VGIC_ACCESS_32bit), + /* SGI_base registers */ + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0, + vgic_mmio_read_group, vgic_mmio_write_group, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0, + vgic_mmio_read_enable, vgic_mmio_write_senable, + NULL, vgic_uaccess_write_senable, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0, + vgic_mmio_read_enable, vgic_mmio_write_cenable, + NULL, vgic_uaccess_write_cenable, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0, + vgic_mmio_read_pending, vgic_mmio_write_spending, + vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0, + vgic_mmio_read_pending, vgic_mmio_write_cpending, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + 
GICR_ISACTIVER0, + vgic_mmio_read_active, vgic_mmio_write_sactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0, + vgic_mmio_read_active, vgic_mmio_write_cactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0, + vgic_mmio_read_priority, vgic_mmio_write_priority, 32, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0, + vgic_mmio_read_config, vgic_mmio_write_config, 8, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), +}; + +unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v3_dist_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return SZ_64K; +} + +/** + * vgic_register_redist_iodev - register a single redist iodev + * @vcpu: The VCPU to which the redistributor belongs + * + * Register a KVM iodev for this VCPU's redistributor using the address + * provided. + * + * Return 0 on success, -ERRNO otherwise. + */ +int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct vgic_dist *vgic = &kvm->arch.vgic; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; + struct vgic_redist_region *rdreg; + gpa_t rd_base; + int ret = 0; + + lockdep_assert_held(&kvm->slots_lock); + mutex_lock(&kvm->arch.config_lock); + + if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) + goto out_unlock; + + /* + * We may be creating VCPUs before having set the base address for the + * redistributor region, in which case we will come back to this + * function for all VCPUs when the base address is set. Just return + * without doing any work for now. + */ + rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions); + if (!rdreg) + goto out_unlock; + + if (!vgic_v3_check_base(kvm)) { + ret = -EINVAL; + goto out_unlock; + } + + vgic_cpu->rdreg = rdreg; + vgic_cpu->rdreg_index = rdreg->free_index; + + rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE; + + kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); + rd_dev->base_addr = rd_base; + rd_dev->iodev_type = IODEV_REDIST; + rd_dev->regions = vgic_v3_rd_registers; + rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); + rd_dev->redist_vcpu = vcpu; + + mutex_unlock(&kvm->arch.config_lock); + + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base, + 2 * SZ_64K, &rd_dev->dev); + if (ret) + return ret; + + /* Protected by slots_lock */ + rdreg->free_index++; + return 0; + +out_unlock: + mutex_unlock(&kvm->arch.config_lock); + return ret; +} + +void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) +{ + struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; + + kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev); +} + +static int vgic_register_all_redist_iodevs(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long c; + int ret = 0; + + lockdep_assert_held(&kvm->slots_lock); + + kvm_for_each_vcpu(c, vcpu, kvm) { + ret = vgic_register_redist_iodev(vcpu); + if (ret) + break; + } + + if (ret) { + /* The current c failed, so iterate over the previous ones. 
*/ + int i; + + for (i = 0; i < c; i++) { + vcpu = kvm_get_vcpu(kvm, i); + vgic_unregister_redist_iodev(vcpu); + } + } + + return ret; +} + +/** + * vgic_v3_alloc_redist_region - Allocate a new redistributor region + * + * Performs various checks before inserting the rdist region in the list. + * Those tests depend on whether the size of the rdist region is known + * (ie. count != 0). The list is sorted by rdist region index. + * + * @kvm: kvm handle + * @index: redist region index + * @base: base of the new rdist region + * @count: number of redistributors the region is made of (0 in the old style + * single region, whose size is induced from the number of vcpus) + * + * Return 0 on success, < 0 otherwise + */ +static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, + gpa_t base, uint32_t count) +{ + struct vgic_dist *d = &kvm->arch.vgic; + struct vgic_redist_region *rdreg; + struct list_head *rd_regions = &d->rd_regions; + int nr_vcpus = atomic_read(&kvm->online_vcpus); + size_t size = count ? count * KVM_VGIC_V3_REDIST_SIZE + : nr_vcpus * KVM_VGIC_V3_REDIST_SIZE; + int ret; + + /* cross the end of memory ? */ + if (base + size < base) + return -EINVAL; + + if (list_empty(rd_regions)) { + if (index != 0) + return -EINVAL; + } else { + rdreg = list_last_entry(rd_regions, + struct vgic_redist_region, list); + + /* Don't mix single region and discrete redist regions */ + if (!count && rdreg->count) + return -EINVAL; + + if (!count) + return -EEXIST; + + if (index != rdreg->index + 1) + return -EINVAL; + } + + /* + * For legacy single-region redistributor regions (!count), + * check that the redistributor region does not overlap with the + * distributor's address space. + */ + if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && + vgic_dist_overlap(kvm, base, size)) + return -EINVAL; + + /* collision with any other rdist region? */ + if (vgic_v3_rdist_overlap(kvm, base, size)) + return -EINVAL; + + rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL_ACCOUNT); + if (!rdreg) + return -ENOMEM; + + rdreg->base = VGIC_ADDR_UNDEF; + + ret = vgic_check_iorange(kvm, rdreg->base, base, SZ_64K, size); + if (ret) + goto free; + + rdreg->base = base; + rdreg->count = count; + rdreg->free_index = 0; + rdreg->index = index; + + list_add_tail(&rdreg->list, rd_regions); + return 0; +free: + kfree(rdreg); + return ret; +} + +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg) +{ + struct kvm_vcpu *vcpu; + unsigned long c; + + lockdep_assert_held(&kvm->arch.config_lock); + + /* Garbage collect the region */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (vcpu->arch.vgic_cpu.rdreg == rdreg) + vcpu->arch.vgic_cpu.rdreg = NULL; + } + + list_del(&rdreg->list); + kfree(rdreg); +} + +int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) +{ + int ret; + + mutex_lock(&kvm->arch.config_lock); + ret = vgic_v3_alloc_redist_region(kvm, index, addr, count); + mutex_unlock(&kvm->arch.config_lock); + if (ret) + return ret; + + /* + * Register iodevs for each existing VCPU. Adding more VCPUs + * afterwards will register the iodevs when needed. 
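+ *
+ * Both userspace addressing styles end up here: the legacy
+ * KVM_VGIC_V3_ADDR_TYPE_REDIST attribute becomes a single region
+ * with index == 0 and count == 0 (sized from the number of vCPUs),
+ * while KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION passes an explicit
+ * index and count for each discrete region.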
+ */ + ret = vgic_register_all_redist_iodevs(kvm); + if (ret) { + struct vgic_redist_region *rdreg; + + mutex_lock(&kvm->arch.config_lock); + rdreg = vgic_v3_rdist_region_from_index(kvm, index); + vgic_v3_free_redist_region(kvm, rdreg); + mutex_unlock(&kvm->arch.config_lock); + return ret; + } + + return 0; +} + +int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + struct vgic_io_device iodev; + struct vgic_reg_attr reg_attr; + struct kvm_vcpu *vcpu; + gpa_t addr; + int ret; + + ret = vgic_v3_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + iodev.regions = vgic_v3_dist_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); + iodev.base_addr = 0; + break; + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{ + iodev.regions = vgic_v3_rd_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); + iodev.base_addr = 0; + break; + } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + return vgic_v3_has_cpu_sysregs_attr(vcpu, attr); + default: + return -ENXIO; + } + + /* We only support aligned 32-bit accesses. */ + if (addr & 3) + return -ENXIO; + + region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); + if (!region) + return -ENXIO; + + return 0; +} + +/* + * The ICC_SGI* registers encode the affinity differently from the MPIDR, + * so provide a wrapper to use the existing defines to isolate a certain + * affinity level. + */ +#define SGI_AFFINITY_LEVEL(reg, level) \ + ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ + >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) + +static void vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, u32 sgi, bool allow_group1) +{ + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, sgi); + unsigned long flags; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* + * An access targeting Group0 SGIs can only generate + * those, while an access targeting Group1 SGIs can + * generate interrupts of either group. + */ + if (!irq->group || allow_group1) { + if (!irq->hw) { + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } else { + /* HW SGI? Ask the GIC to inject it */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + true); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + } else { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + + vgic_put_irq(vcpu->kvm, irq); +} + +/** + * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs + * @vcpu: The VCPU requesting a SGI + * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU + * @allow_group1: Does the sysreg access allow generation of G1 SGIs + * + * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. + * This will trap in sys_regs.c and call this function. + * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the + * target processors as well as a bitmask of 16 Aff0 CPUs. + * + * If the interrupt routing mode bit is not set, we iterate over the Aff0 + * bits and signal the VCPUs matching the provided Aff{3,2,1}. + * + * If this bit is set, we signal all, but not the calling VCPU. 
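+ *
+ * As a worked example, using the ICC_SGI1R_* field layout: a guest
+ * write of reg == 0x0000000006010005 has IRM clear, SGI ID 6,
+ * Aff3.Aff2.Aff1 == 0.0.1 and TargetList == 0b0101, so SGI 6 is made
+ * pending on the vCPUs whose MPIDR is 0.0.1.0 and 0.0.1.2 (if they
+ * exist).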
+ */ +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *c_vcpu; + unsigned long target_cpus; + u64 mpidr; + u32 sgi, aff0; + unsigned long c; + + sgi = FIELD_GET(ICC_SGI1R_SGI_ID_MASK, reg); + + /* Broadcast */ + if (unlikely(reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT))) { + kvm_for_each_vcpu(c, c_vcpu, kvm) { + /* Don't signal the calling VCPU */ + if (c_vcpu == vcpu) + continue; + + vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); + } + + return; + } + + /* We iterate over affinities to find the corresponding vcpus */ + mpidr = SGI_AFFINITY_LEVEL(reg, 3); + mpidr |= SGI_AFFINITY_LEVEL(reg, 2); + mpidr |= SGI_AFFINITY_LEVEL(reg, 1); + target_cpus = FIELD_GET(ICC_SGI1R_TARGET_LIST_MASK, reg); + + for_each_set_bit(aff0, &target_cpus, hweight_long(ICC_SGI1R_TARGET_LIST_MASK)) { + c_vcpu = kvm_mpidr_to_vcpu(kvm, mpidr | aff0); + if (c_vcpu) + vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); + } +} + +int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v3_dist_registers, + .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers), + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} + +int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device rd_dev = { + .regions = vgic_v3_rd_registers, + .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers), + }; + + return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); +} + +int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, + u32 intid, u32 *val) +{ + if (intid % 32) + return -EINVAL; + + if (is_write) + vgic_write_irq_line_level_info(vcpu, intid, *val); + else + *val = vgic_read_irq_line_level_info(vcpu, intid); + + return 0; +} diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c new file mode 100644 index 000000000000..a573b1f0c6cb --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -0,0 +1,1103 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGIC MMIO handling functions + */ + +#include <linux/bitops.h> +#include <linux/bsearch.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/iodev.h> +#include <kvm/arm_arch_timer.h> +#include <kvm/arm_vgic.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return 0; +} + +unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return -1UL; +} + +void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + /* Ignore */ +} + +int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + /* Ignore */ + return 0; +} + +unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + if (irq->group) + value |= BIT(i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +static void vgic_update_vsgi(struct vgic_irq *irq) +{ + WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group)); +} + +void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ 
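+ /*
+ * One bit per interrupt: for instance, a 32-bit write of 0x0000000f
+ * to GICD_IGROUPR1 (covering INTIDs 32-63) moves INTIDs 32-35 to
+ * Group1 and puts INTIDs 36-63 back in Group0.
+ */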
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->group = !!(val & BIT(i)); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + vgic_update_vsgi(irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } else { + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } + + vgic_put_irq(vcpu->kvm, irq); + } +} + +/* + * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value + * of the enabled bit, so there is only one function for both here. + */ +unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + if (irq->enabled) + value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + if (!irq->enabled) { + struct irq_data *data; + + irq->enabled = true; + data = &irq_to_desc(irq->host_irq)->irq_data; + while (irqd_irq_disabled(data)) + enable_irq(irq->host_irq); + } + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + continue; + } else if (vgic_irq_is_mapped_level(irq)) { + bool was_high = irq->line_level; + + /* + * We need to update the state of the interrupt because + * the guest might have changed the state of the device + * while the interrupt was disabled at the VGIC level. + */ + irq->line_level = vgic_get_phys_line_level(irq); + /* + * Deactivate the physical interrupt so the GIC will let + * us know when it is asserted again. 
+ */ + if (!irq->active && was_high && !irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + irq->enabled = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled) + disable_irq_nosync(irq->host_irq); + + irq->enabled = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->enabled = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->enabled = false; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +static unsigned long __read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + bool is_user) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + unsigned long flags; + bool val; + + /* + * When used from userspace with a GICv3 model: + * + * Pending state of interrupt is latched in pending_latch + * variable. Userspace will save and restore pending state + * and line_level separately. + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst + * for handling of ISPENDR and ICPENDR. 
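+ *
+ * A typical restore sequence therefore writes the latch through the
+ * ISPENDR uaccess handlers and brings line_level back separately
+ * via KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO.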
+ */ + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + int err; + + val = false; + err = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &val); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + } else if (!is_user && vgic_irq_is_mapped_level(irq)) { + val = vgic_get_phys_line_level(irq); + } else { + switch (vcpu->kvm->arch.vgic.vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + if (is_user) { + val = irq->pending_latch; + break; + } + fallthrough; + default: + val = irq_is_pending(irq); + break; + } + } + + value |= ((u32)val << i); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return __read_pending(vcpu, addr, len, false); +} + +unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return __read_pending(vcpu, addr, len, true); +} + +static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) +{ + return (vgic_irq_is_sgi(irq->intid) && + vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2); +} + +static void __set_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, + unsigned long val, bool is_user) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + /* GICD_ISPENDR0 SGI bits are WI when written from the guest. */ + if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { + vgic_put_irq(vcpu->kvm, irq); + continue; + } + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* + * GICv2 SGIs are terribly broken. We can't restore + * the source of the interrupt, so just pick the vcpu + * itself as the source... + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source |= BIT(vcpu->vcpu_id); + + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + /* HW SGI? Ask the GIC to inject it */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + true); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + continue; + } + + irq->pending_latch = true; + if (irq->hw && !is_user) + vgic_irq_set_phys_active(irq, true); + + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __set_pending(vcpu, addr, len, val, false); +} + +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __set_pending(vcpu, addr, len, val, true); + return 0; +} + +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq) +{ + irq->pending_latch = false; + + /* + * We don't want the guest to effectively mask the physical + * interrupt by doing a write to SPENDR followed by a write to + * CPENDR for HW interrupts, so we clear the active state on + * the physical side if the virtual interrupt is not active. + * This may lead to taking an additional interrupt on the + * host, but that should not be a problem as the worst that + * can happen is an additional vgic injection. We also clear + * the pending state to maintain proper semantics for edge HW + * interrupts. 
+ */ + vgic_irq_set_phys_pending(irq, false); + if (!irq->active) + vgic_irq_set_phys_active(irq, false); +} + +static void __clear_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val, bool is_user) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + /* GICD_ICPENDR0 SGI bits are WI when written from the guest. */ + if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { + vgic_put_irq(vcpu->kvm, irq); + continue; + } + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* + * More fun with GICv2 SGIs! If we're clearing one of them + * from userspace, which source vcpu to clear? Let's not + * even think of it, and blow the whole set. + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source = 0; + + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + /* HW SGI? Ask the GIC to clear its pending bit */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + false); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + continue; + } + + if (irq->hw && !is_user) + vgic_hw_irq_cpending(vcpu, irq); + else + irq->pending_latch = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __clear_pending(vcpu, addr, len, val, false); +} + +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __clear_pending(vcpu, addr, len, val, true); + return 0; +} + +/* + * If we are fiddling with an IRQ's active state, we have to make sure the IRQ + * is not queued on some running VCPU's LRs, because then the change to the + * active state can be overwritten when the VCPU's state is synced coming back + * from the guest. + * + * For shared interrupts as well as GICv3 private interrupts accessed from the + * non-owning CPU, we have to stop all the VCPUs because interrupts can be + * migrated while we don't hold the IRQ locks and we don't want to be chasing + * moving targets. + * + * For GICv2 private interrupts we don't have to do anything because + * userspace accesses to the VGIC state already require all VCPUs to be + * stopped, and only the VCPU itself can modify its private interrupts + * active state, which guarantees that the VCPU is not running. + */ +static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) +{ + if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && + vcpu != kvm_get_running_vcpu()) || + intid >= VGIC_NR_PRIVATE_IRQS) + kvm_arm_halt_guest(vcpu->kvm); +} + +/* See vgic_access_active_prepare */ +static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid) +{ + if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && + vcpu != kvm_get_running_vcpu()) || + intid >= VGIC_NR_PRIVATE_IRQS) + kvm_arm_resume_guest(vcpu->kvm); +} + +static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + /* + * Even for HW interrupts, don't evaluate the HW state as + * all the guest is interested in is the virtual state. 
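+ *
+ * A forwarded (HW) interrupt can, for instance, still be active at
+ * the physical distributor while the virtual interrupt is merely
+ * pending for the guest; only irq->active, the virtual state, is
+ * reported below.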
+ */ + if (irq->active) + value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 val; + + mutex_lock(&vcpu->kvm->arch.config_lock); + vgic_access_active_prepare(vcpu, intid); + + val = __vgic_mmio_read_active(vcpu, addr, len); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return val; +} + +unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return __vgic_mmio_read_active(vcpu, addr, len); +} + +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool active, bool is_uaccess) +{ + if (is_uaccess) + return; + + irq->active = active; + vgic_irq_set_phys_active(irq, active); +} + +static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool active) +{ + unsigned long flags; + struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu(); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw && !vgic_irq_is_sgi(irq->intid)) { + vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); + } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + /* + * GICv4.1 VSGI feature doesn't track an active state, + * so let's not kid ourselves, there is nothing we can + * do here. + */ + irq->active = false; + } else { + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u8 active_source; + + irq->active = active; + + /* + * The GICv2 architecture indicates that the source CPUID for + * an SGI should be provided during an EOI which implies that + * the active state is stored somewhere, but at the same time + * this state is not architecturally exposed anywhere and we + * have no way of knowing the right source. + * + * This may lead to a VCPU not being able to receive + * additional instances of a particular SGI after migration + * for a GICv2 VM on some GIC implementations. Oh well. + */ + active_source = (requester_vcpu) ? 
requester_vcpu->vcpu_id : 0; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && + active && vgic_irq_is_sgi(irq->intid)) + irq->active_source = active_source; + } + + if (irq->active) + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + else + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); +} + +static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + vgic_mmio_change_active(vcpu, irq, false); + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + + mutex_lock(&vcpu->kvm->arch.config_lock); + vgic_access_active_prepare(vcpu, intid); + + __vgic_mmio_write_cactive(vcpu, addr, len, val); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->arch.config_lock); +} + +int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __vgic_mmio_write_cactive(vcpu, addr, len, val); + return 0; +} + +static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + vgic_mmio_change_active(vcpu, irq, true); + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + + mutex_lock(&vcpu->kvm->arch.config_lock); + vgic_access_active_prepare(vcpu, intid); + + __vgic_mmio_write_sactive(vcpu, addr, len, val); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->arch.config_lock); +} + +int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __vgic_mmio_write_sactive(vcpu, addr, len, val); + return 0; +} + +unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + val |= (u64)irq->priority << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); + } + + return val; +} + +/* + * We currently don't handle changing the priority of an interrupt that + * is already pending on a VCPU. If there is a need for this, we would + * need to make this VCPU exit and re-evaluate the priorities, potentially + * leading to this interrupt getting presented now to the guest (if it has + * been masked by the priority mask before). 
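
A minimal standalone sketch (not part of the patch) of the priority narrowing performed by vgic_mmio_write_priority() below: one byte of the MMIO value is taken per interrupt, and the low bits the virtual GIC does not implement are dropped. VGIC_PRI_BITS is assumed to be 5 here, its usual value in include/kvm/arm_vgic.h.

#include <stdint.h>
#include <stdio.h>

#define VGIC_PRI_BITS   5                                       /* assumed value */
#define PRI_MASK        ((uint8_t)(0xff << (8 - VGIC_PRI_BITS)))  /* GENMASK(7, 3) == 0xf8 */

int main(void)
{
        uint32_t mmio_val = 0xa0adff01;   /* one GICD_IPRIORITYR word, four INTIDs */

        for (int i = 0; i < 4; i++) {
                uint8_t guest  = (mmio_val >> (i * 8)) & 0xff;
                uint8_t stored = guest & PRI_MASK;

                printf("INTID byte %d: guest 0x%02x -> stored 0x%02x\n",
                       i, guest, stored);
        }
        return 0;        /* e.g. a guest write of 0xad is stored as 0xa8 */
}
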
+ */ +void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + unsigned long flags; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* Narrow the priority range to what we actually support */ + irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) + vgic_update_vsgi(irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + +unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 2); + u32 value = 0; + int i; + + for (i = 0; i < len * 4; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); + + if (irq->config == VGIC_CONFIG_EDGE) + value |= (2U << (i * 2)); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +void vgic_mmio_write_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 2); + int i; + unsigned long flags; + + for (i = 0; i < len * 4; i++) { + struct vgic_irq *irq; + + /* + * The configuration cannot be changed for SGIs in general, + * for PPIs this is IMPLEMENTATION DEFINED. The arch timer + * code relies on PPIs being level triggered, so we also + * make them read-only here. + */ + if (intid + i < VGIC_NR_PRIVATE_IRQS) + continue; + + irq = vgic_get_irq(vcpu->kvm, intid + i); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (test_bit(i * 2 + 1, &val)) + irq->config = VGIC_CONFIG_EDGE; + else + irq->config = VGIC_CONFIG_LEVEL; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) +{ + int i; + u32 val = 0; + int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + + for (i = 0; i < 32; i++) { + struct vgic_irq *irq; + + if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) + continue; + + irq = vgic_get_vcpu_irq(vcpu, intid + i); + if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) + val |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return val; +} + +void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, + const u32 val) +{ + int i; + int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + unsigned long flags; + + for (i = 0; i < 32; i++) { + struct vgic_irq *irq; + bool new_level; + + if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) + continue; + + irq = vgic_get_vcpu_irq(vcpu, intid + i); + + /* + * Line level is set irrespective of irq type + * (level or edge) to avoid dependency that VM should + * restore irq config before line level. 
+ */ + new_level = !!(val & (1U << i)); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->line_level = new_level; + if (new_level) + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + else + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + +static int match_region(const void *key, const void *elt) +{ + const unsigned int offset = (unsigned long)key; + const struct vgic_register_region *region = elt; + + if (offset < region->reg_offset) + return -1; + + if (offset >= region->reg_offset + region->len) + return 1; + + return 0; +} + +const struct vgic_register_region * +vgic_find_mmio_region(const struct vgic_register_region *regions, + int nr_regions, unsigned int offset) +{ + return bsearch((void *)(uintptr_t)offset, regions, nr_regions, + sizeof(regions[0]), match_region); +} + +void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_vmcr(vcpu, vmcr); + else + vgic_v3_set_vmcr(vcpu, vmcr); +} + +void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_get_vmcr(vcpu, vmcr); + else + vgic_v3_get_vmcr(vcpu, vmcr); +} + +/* + * kvm_mmio_read_buf() returns a value in a format where it can be converted + * to a byte array and be directly observed as the guest wanted it to appear + * in memory if it had done the store itself, which is LE for the GIC, as the + * guest knows the GIC is always LE. + * + * We convert this value to the CPUs native format to deal with it as a data + * value. + */ +unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len) +{ + unsigned long data = kvm_mmio_read_buf(val, len); + + switch (len) { + case 1: + return data; + case 2: + return le16_to_cpu(data); + case 4: + return le32_to_cpu(data); + default: + return le64_to_cpu(data); + } +} + +/* + * kvm_mmio_write_buf() expects a value in a format such that if converted to + * a byte array it is observed as the guest would see it if it could perform + * the load directly. Since the GIC is LE, and the guest knows this, the + * guest expects a value in little endian format. + * + * We convert the data value from the CPUs native format to LE so that the + * value is returned in the proper format. + */ +void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, + unsigned long data) +{ + switch (len) { + case 1: + break; + case 2: + data = cpu_to_le16(data); + break; + case 4: + data = cpu_to_le32(data); + break; + default: + data = cpu_to_le64(data); + } + + kvm_mmio_write_buf(buf, len, data); +} + +static +struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev) +{ + return container_of(dev, struct vgic_io_device, dev); +} + +static bool check_region(const struct kvm *kvm, + const struct vgic_register_region *region, + gpa_t addr, int len) +{ + int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + + switch (len) { + case sizeof(u8): + flags = VGIC_ACCESS_8bit; + break; + case sizeof(u32): + flags = VGIC_ACCESS_32bit; + break; + case sizeof(u64): + flags = VGIC_ACCESS_64bit; + break; + default: + return false; + } + + if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) { + if (!region->bits_per_irq) + return true; + + /* Do we access a non-allocated IRQ? 
*/ + return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs; + } + + return false; +} + +const struct vgic_register_region * +vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, + gpa_t addr, int len) +{ + const struct vgic_register_region *region; + + region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, + addr - iodev->base_addr); + if (!region || !check_region(vcpu->kvm, region, addr, len)) + return NULL; + + return region; +} + +static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, + gpa_t addr, u32 *val) +{ + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + + region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); + if (!region) { + *val = 0; + return 0; + } + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + if (region->uaccess_read) + *val = region->uaccess_read(r_vcpu, addr, sizeof(u32)); + else + *val = region->read(r_vcpu, addr, sizeof(u32)); + + return 0; +} + +static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, + gpa_t addr, const u32 *val) +{ + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + + region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); + if (!region) + return 0; + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + if (region->uaccess_write) + return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); + + region->write(r_vcpu, addr, sizeof(u32), *val); + return 0; +} + +/* + * Userland access to VGIC registers. + */ +int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, + bool is_write, int offset, u32 *val) +{ + if (is_write) + return vgic_uaccess_write(vcpu, dev, offset, val); + else + return vgic_uaccess_read(vcpu, dev, offset, val); +} + +static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + unsigned long data = 0; + + region = vgic_get_mmio_region(vcpu, iodev, addr, len); + if (!region) { + memset(val, 0, len); + return 0; + } + + switch (iodev->iodev_type) { + case IODEV_CPUIF: + data = region->read(vcpu, addr, len); + break; + case IODEV_DIST: + data = region->read(vcpu, addr, len); + break; + case IODEV_REDIST: + data = region->read(iodev->redist_vcpu, addr, len); + break; + case IODEV_ITS: + data = region->its_read(vcpu->kvm, iodev->its, addr, len); + break; + } + + vgic_data_host_to_mmio_bus(val, len, data); + return 0; +} + +static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + unsigned long data = vgic_data_mmio_bus_to_host(val, len); + + region = vgic_get_mmio_region(vcpu, iodev, addr, len); + if (!region) + return 0; + + switch (iodev->iodev_type) { + case IODEV_CPUIF: + region->write(vcpu, addr, len, data); + break; + case IODEV_DIST: + region->write(vcpu, addr, len, data); + break; + case IODEV_REDIST: + region->write(iodev->redist_vcpu, addr, len, data); + break; + case IODEV_ITS: + region->its_write(vcpu->kvm, iodev->its, addr, len, data); + break; + } + + return 0; +} + +const struct kvm_io_device_ops kvm_io_gic_ops = { + .read = dispatch_mmio_read, + .write = dispatch_mmio_write, +}; + +int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, + enum vgic_type type) +{ + struct vgic_io_device 
*io_device = &kvm->arch.vgic.dist_iodev; + unsigned int len; + + switch (type) { + case VGIC_V2: + len = vgic_v2_init_dist_iodev(io_device); + break; + case VGIC_V3: + len = vgic_v3_init_dist_iodev(io_device); + break; + default: + BUG(); + } + + io_device->base_addr = dist_base_address; + io_device->iodev_type = IODEV_DIST; + io_device->redist_vcpu = NULL; + + return kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, + len, &io_device->dev); +} diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h new file mode 100644 index 000000000000..50dc80220b0f --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + */ +#ifndef __KVM_ARM_VGIC_MMIO_H__ +#define __KVM_ARM_VGIC_MMIO_H__ + +struct vgic_register_region { + unsigned int reg_offset; + unsigned int len; + unsigned int bits_per_irq; + unsigned int access_flags; + union { + unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len); + }; + union { + void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + void (*its_write)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val); + }; + unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + union { + int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val); + }; +}; + +extern const struct kvm_io_device_ops kvm_io_gic_ops; + +#define VGIC_ACCESS_8bit 1 +#define VGIC_ACCESS_32bit 2 +#define VGIC_ACCESS_64bit 4 + +/* + * Generate a mask that covers the number of bytes required to address + * up to 1024 interrupts, each represented by <bits> bits. This assumes + * that <bits> is a power of two. + */ +#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1) + +/* + * (addr & mask) gives us the _byte_ offset for the INT ID. + * We multiply this by 8 the get the _bit_ offset, then divide this by + * the number of bits to learn the actual INT ID. + * But instead of a division (which requires a "long long div" implementation), + * we shift by the binary logarithm of <bits>. + * This assumes that <bits> is a power of two. + */ +#define VGIC_ADDR_TO_INTID(addr, bits) (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \ + 8 >> ilog2(bits)) + +/* + * Some VGIC registers store per-IRQ information, with a different number + * of bits per IRQ. For those registers this macro is used. + * The _WITH_LENGTH version instantiates registers with a fixed length + * and is mutually exclusive with the _PER_IRQ version. 
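
A quick standalone check (not part of the patch) of the VGIC_ADDR_TO_INTID() arithmetic explained above, with the ilog2() shift written as an ordinary division for the demo (bits is a power of two, so the two are equivalent):

#include <stdio.h>

/* same math as VGIC_ADDR_IRQ_MASK()/VGIC_ADDR_TO_INTID() above */
#define DEMO_IRQ_MASK(bits)             (((bits) * 1024 / 8) - 1)
#define DEMO_ADDR_TO_INTID(addr, bits)  (((addr) & DEMO_IRQ_MASK(bits)) * 8 / (bits))

int main(void)
{
        /* GICD_ISENABLER: 1 bit per IRQ, so byte offset 0x08 starts at INTID 64 */
        printf("1 bit/IRQ,  offset 0x08 -> INTID %d\n", DEMO_ADDR_TO_INTID(0x08, 1));
        /* GICD_IPRIORITYR: 8 bits per IRQ, so byte offset 0x20 starts at INTID 32 */
        printf("8 bits/IRQ, offset 0x20 -> INTID %d\n", DEMO_ADDR_TO_INTID(0x20, 8));
        return 0;
}
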
+ */ +#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = bpi, \ + .len = bpi * 1024 / 8, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + .uaccess_read = ur, \ + .uaccess_write = uw, \ + } + +#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = 0, \ + .len = length, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + } + +#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = 0, \ + .len = length, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + .uaccess_read = urd, \ + .uaccess_write = uwr, \ + } + +unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); + +void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, + unsigned long data); + +unsigned long extract_bytes(u64 data, unsigned int offset, + unsigned int num); + +u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + +void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, 
+ gpa_t addr, unsigned int len); + +void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, + bool is_write, int offset, u32 *val); + +u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); + +void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, + const u32 val); + +unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev); + +unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); + +u64 vgic_sanitise_outer_cacheability(u64 reg); +u64 vgic_sanitise_inner_cacheability(u64 reg); +u64 vgic_sanitise_shareability(u64 reg); +u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, + u64 (*sanitise_fn)(u64)); + +/* Find the proper register handler entry given a certain address offset */ +const struct vgic_register_region * +vgic_find_mmio_region(const struct vgic_register_region *regions, + int nr_regions, unsigned int offset); + +#endif diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c new file mode 100644 index 000000000000..585491fbda80 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -0,0 +1,622 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. + */ + +#include <linux/irqchip/arm-gic.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_mmu.h> + +#include "vgic-mmio.h" +#include "vgic.h" + +static inline void vgic_v2_write_lr(int lr, u32 val) +{ + void __iomem *base = kvm_vgic_global_state.vctrl_base; + + writel_relaxed(val, base + GICH_LR0 + (lr * 4)); +} + +void vgic_v2_init_lrs(void) +{ + int i; + + for (i = 0; i < kvm_vgic_global_state.nr_lr; i++) + vgic_v2_write_lr(i, 0); +} + +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + cpuif->vgic_hcr = GICH_HCR_EN; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_UIE; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP0_MASK) ? + GICH_HCR_VGrp0DIE : GICH_HCR_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP1_MASK) ? 
+ GICH_HCR_VGrp1DIE : GICH_HCR_VGrp1EIE; +} + +static bool lr_signals_eoi_mi(u32 lr_val) +{ + return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) && + !(lr_val & GICH_LR_HW); +} + +static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val) +{ + u32 cpuid, intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; + bool deactivated; + + /* Extract the source vCPU id from the LR */ + cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val) & 7; + + /* Notify fds when the guest EOI'ed a level-triggered SPI */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + + irq = vgic_get_vcpu_irq(vcpu, intid); + + scoped_guard(raw_spinlock, &irq->irq_lock) { + /* Always preserve the active bit, note deactivation */ + deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); + irq->active = !!(val & GICH_LR_ACTIVE_BIT); + + if (irq->active && vgic_irq_is_sgi(intid)) + irq->active_source = cpuid; + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & GICH_LR_PENDING_BIT)) { + irq->pending_latch = true; + + if (vgic_irq_is_sgi(intid)) + irq->source |= (1 << cpuid); + } + + /* + * Clear soft pending state when level irqs have been acked. + */ + if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) + irq->pending_latch = false; + + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); + + irq->on_lr = false; + } + + vgic_put_irq(vcpu->kvm, irq); +} + +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + +/* + * transfer the content of the LRs back into the corresponding ap_list: + * - active bit is transferred as is + * - pending bit is + * - transferred as is in case of edge sensitive IRQs + * - set to the line-level (resample time) for level sensitive IRQs + */ +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr); + struct vgic_irq *irq; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) + vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]); + + /* See the GICv3 equivalent for the EOIcount handling rationale */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u32 lr; + + if (!eoicount) { + break; + } else { + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + vgic_v2_fold_lr(vcpu, lr); + eoicount--; + } + + cpuif->used_lrs = 0; +} + +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false; + struct vgic_irq *irq; + unsigned long flags; + u64 lr = 0; + u8 cpuid; + + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); + + /* We only deal with DIR when EOIMode==1 */ + if (!(cpuif->vgic_vmcr & GICH_VMCR_EOI_MODE_MASK)) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = 
vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* See the corresponding v3 code for the rationale */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; + + /* + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. + */ + if (irq->on_lr) { + mmio = true; + goto put; + } + + /* SGI: check that the cpuid matches */ + if (val < VGIC_NR_SGIS && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } + + /* (with a Dalek voice) DEACTIVATE!!!! */ + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + + vgic_v2_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); +} + +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) +{ + u32 val = irq->intid; + bool allow_pending = true; + + WARN_ON(irq->on_lr); + + if (irq->active) { + val |= GICH_LR_ACTIVE_BIT; + if (vgic_irq_is_sgi(irq->intid)) + val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; + if (vgic_irq_is_multi_sgi(irq)) { + allow_pending = false; + val |= GICH_LR_EOI; + } + } + + if (irq->group) + val |= GICH_LR_GROUP1; + + if (irq->hw && !vgic_irq_needs_resampling(irq)) { + val |= GICH_LR_HW; + val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; + /* + * Never set pending+active on a HW interrupt, as the + * pending state is kept at the physical distributor + * level. + */ + if (irq->active) + allow_pending = false; + } else { + if (irq->config == VGIC_CONFIG_LEVEL) { + val |= GICH_LR_EOI; + + /* + * Software resampling doesn't work very well + * if we allow P+A, so let's not do that. + */ + if (irq->active) + allow_pending = false; + } + } + + if (allow_pending && irq_is_pending(irq)) { + val |= GICH_LR_PENDING_BIT; + + if (vgic_irq_is_sgi(irq->intid)) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return 0; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + if (irq->source & ~BIT(src - 1)) + val |= GICH_LR_EOI; + } + } + + /* The GICv2 LR only holds five bits of priority. */ + val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + + return val; +} + +/* + * Populates the particular LR with the state of a given IRQ: + * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq + * - for a level sensitive IRQ the pending state value is unchanged; + * it is dictated directly by the input level + * + * If @irq describes an SGI with multiple sources, we choose the + * lowest-numbered source VCPU and clear that bit in the source bitmap. + * + * The irq_lock must be held by the caller. 
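
A standalone sketch (not part of the patch) of the multi-source SGI behaviour described above: each injection picks the lowest-numbered source vCPU, clears it from the source bitmap, and keeps requesting an EOI maintenance interrupt while further sources remain so they can be injected later.

#include <stdio.h>
#include <strings.h>    /* ffs() */

int main(void)
{
        /* an SGI still pending from virtual CPUs 1 and 3 */
        unsigned int source = (1U << 1) | (1U << 3);

        while (source) {
                int src = ffs(source);            /* lowest source, 1-based */

                source &= ~(1U << (src - 1));     /* consumed by this injection */
                printf("inject SGI from vCPU %d, EOI maintenance: %s\n",
                       src - 1, source ? "requested" : "not needed");
        }
        return 0;
}
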
+ */ +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 val = vgic_v2_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; + + if (val & GICH_LR_PENDING_BIT) { + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending_latch = false; + + if (vgic_irq_is_sgi(irq->intid)) { + u32 src = ffs(irq->source); + + irq->source &= ~BIT(src - 1); + if (irq->source) + irq->pending_latch = true; + } + } + + /* + * Level-triggered mapped IRQs are special because we only observe + * rising edges as input to the VGIC. We therefore lower the line + * level here, so that we can take new virtual IRQs. See + * vgic_v2_fold_lr_state for more info. + */ + if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) + irq->line_level = false; + + /* The GICv2 LR only holds five bits of priority. */ + val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + + irq->on_lr = true; +} + +void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0; +} + +void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u32 vmcr; + + vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) & + GICH_VMCR_ENABLE_GRP0_MASK; + vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) & + GICH_VMCR_ENABLE_GRP1_MASK; + vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) & + GICH_VMCR_ACK_CTL_MASK; + vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) & + GICH_VMCR_FIQ_EN_MASK; + vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) & + GICH_VMCR_CBPR_MASK; + vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) & + GICH_VMCR_EOI_MODE_MASK; + vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & + GICH_VMCR_ALIAS_BINPOINT_MASK; + vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & + GICH_VMCR_BINPOINT_MASK; + vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) << + GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; + + cpu_if->vgic_vmcr = vmcr; +} + +void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; + + vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >> + GICH_VMCR_ENABLE_GRP0_SHIFT; + vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >> + GICH_VMCR_ENABLE_GRP1_SHIFT; + vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >> + GICH_VMCR_ACK_CTL_SHIFT; + vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >> + GICH_VMCR_FIQ_EN_SHIFT; + vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >> + GICH_VMCR_CBPR_SHIFT; + vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >> + GICH_VMCR_EOI_MODE_SHIFT; + + vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> + GICH_VMCR_ALIAS_BINPOINT_SHIFT; + vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> + GICH_VMCR_BINPOINT_SHIFT; + vmcrp->pmr = ((vmcr & GICH_VMCR_PRIMASK_MASK) >> + GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; +} + +void vgic_v2_reset(struct kvm_vcpu *vcpu) +{ + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. 
+ */ + vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; +} + +/* check for overlapping regions and for regions crossing the end of memory */ +static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) +{ + if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base) + return false; + if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base) + return false; + + if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base) + return true; + if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base) + return true; + + return false; +} + +int vgic_v2_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned int len; + int ret = 0; + + if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || + IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { + kvm_debug("Need to set vgic cpu and dist addresses first\n"); + return -ENXIO; + } + + if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) { + kvm_debug("VGIC CPU and dist frames overlap\n"); + return -EINVAL; + } + + /* + * Initialize the vgic if this hasn't already been done on demand by + * accessing the vgic state from userspace. + */ + ret = vgic_init(kvm); + if (ret) { + kvm_err("Unable to initialize VGIC dynamic data structures\n"); + return ret; + } + + len = vgic_v2_init_cpuif_iodev(&dist->cpuif_iodev); + dist->cpuif_iodev.base_addr = dist->vgic_cpu_base; + dist->cpuif_iodev.iodev_type = IODEV_CPUIF; + dist->cpuif_iodev.redist_vcpu = NULL; + + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist->vgic_cpu_base, + len, &dist->cpuif_iodev.dev); + if (ret) + return ret; + + if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { + ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, + kvm_vgic_global_state.vcpu_base, + KVM_VGIC_V2_CPU_SIZE - SZ_4K, true); + if (ret) { + kvm_err("Unable to remap VGIC CPU to VCPU\n"); + return ret; + } + } + + return 0; +} + +DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap); + +/** + * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller + * @info: pointer to the GIC description + * + * Returns 0 if the VGICv2 has been probed successfully, returns an error code + * otherwise + */ +int vgic_v2_probe(const struct gic_kvm_info *info) +{ + int ret; + u32 vtr; + + if (is_protected_kvm_enabled()) { + kvm_err("GICv2 not supported in protected mode\n"); + return -ENXIO; + } + + if (!info->vctrl.start) { + kvm_err("GICH not present in the firmware table\n"); + return -ENXIO; + } + + if (!PAGE_ALIGNED(info->vcpu.start) || + !PAGE_ALIGNED(resource_size(&info->vcpu))) { + kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n"); + + ret = create_hyp_io_mappings(info->vcpu.start, + resource_size(&info->vcpu), + &kvm_vgic_global_state.vcpu_base_va, + &kvm_vgic_global_state.vcpu_hyp_va); + if (ret) { + kvm_err("Cannot map GICV into hyp\n"); + goto out; + } + + static_branch_enable(&vgic_v2_cpuif_trap); + } + + ret = create_hyp_io_mappings(info->vctrl.start, + resource_size(&info->vctrl), + &kvm_vgic_global_state.vctrl_base, + &kvm_vgic_global_state.vctrl_hyp); + if (ret) { + kvm_err("Cannot map VCTRL into hyp\n"); + goto out; + } + + vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); + kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; + + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device\n"); + goto out; + } + + kvm_vgic_global_state.can_emulate_gicv2 = true; + kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.gicc_base = info->gicc_base; + kvm_vgic_global_state.type = VGIC_V2; + 
kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; + + kvm_debug("vgic-v2@%llx\n", info->vctrl.start); + + return 0; +out: + if (kvm_vgic_global_state.vctrl_base) + iounmap(kvm_vgic_global_state.vctrl_base); + if (kvm_vgic_global_state.vcpu_base_va) + iounmap(kvm_vgic_global_state.vcpu_base_va); + + return ret; +} + +static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u64 used_lrs = cpu_if->used_lrs; + u64 elrsr; + int i; + + elrsr = readl_relaxed(base + GICH_ELRSR0); + if (unlikely(used_lrs > 32)) + elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32; + + for (i = 0; i < used_lrs; i++) { + if (elrsr & (1UL << i)) + cpu_if->vgic_lr[i] &= ~GICH_LR_STATE; + else + cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4)); + + writel_relaxed(0, base + GICH_LR0 + (i * 4)); + } +} + +void vgic_v2_save_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + void __iomem *base = kvm_vgic_global_state.vctrl_base; + u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; + + if (!base) + return; + + cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); + + if (used_lrs) + save_lrs(vcpu, base); + + if (cpu_if->vgic_hcr & GICH_HCR_LRENPIE) { + u32 val = readl_relaxed(base + GICH_HCR); + + cpu_if->vgic_hcr &= ~GICH_HCR_EOICOUNT; + cpu_if->vgic_hcr |= val & GICH_HCR_EOICOUNT; + } + + writel_relaxed(0, base + GICH_HCR); +} + +void vgic_v2_restore_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + void __iomem *base = kvm_vgic_global_state.vctrl_base; + u64 used_lrs = cpu_if->used_lrs; + int i; + + if (!base) + return; + + writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); + + for (i = 0; i < used_lrs; i++) + writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4)); +} + +void vgic_v2_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + writel_relaxed(cpu_if->vgic_vmcr, + kvm_vgic_global_state.vctrl_base + GICH_VMCR); + writel_relaxed(cpu_if->vgic_apr, + kvm_vgic_global_state.vctrl_base + GICH_APR); +} + +void vgic_v2_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); +} diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c new file mode 100644 index 000000000000..61b44f3f2bf1 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/uaccess.h> + +#include <kvm/arm_vgic.h> + +#include <asm/kvm_arm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> + +#include "vgic.h" + +#define ICH_LRN(n) (ICH_LR0_EL2 + (n)) +#define ICH_AP0RN(n) (ICH_AP0R0_EL2 + (n)) +#define ICH_AP1RN(n) (ICH_AP1R0_EL2 + (n)) + +struct mi_state { + u16 eisr; + u16 elrsr; + bool pend; +}; + +/* + * The shadow registers loaded to the hardware when running a L2 guest + * with the virtual IMO/FMO bits set. 
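
A standalone sketch (not part of the patch) of the lr_map bookkeeping used by the shadow interface declared just below: shadow LRs are packed densely, so the shadow slot of a given L1 list register is simply the number of in-use L1 LRs with a smaller index, which is what lr_map_idx_to_shadow_idx() computes with hweight16().

#include <stdio.h>

static int shadow_idx(unsigned long lr_map, int idx)
{
        /* count the in-use L1 LRs below 'idx' */
        return __builtin_popcountl(lr_map & ((1UL << idx) - 1));
}

int main(void)
{
        /* suppose L1 put valid state only in its LRs 1, 4 and 5 */
        unsigned long lr_map = (1UL << 1) | (1UL << 4) | (1UL << 5);

        for (int idx = 0; idx < 8; idx++)
                if (lr_map & (1UL << idx))
                        printf("L1 LR%d -> shadow LR%d\n", idx, shadow_idx(lr_map, idx));
        return 0;
}
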
+ */ +struct shadow_if { + struct vgic_v3_cpu_if cpuif; + unsigned long lr_map; +}; + +static DEFINE_PER_CPU(struct shadow_if, shadow_if); + +static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) +{ + return hweight16(shadow_if->lr_map & (BIT(idx) - 1)); +} + +/* + * Nesting GICv3 support + * + * On a non-nesting VM (only running at EL0/EL1), the host hypervisor + * completely controls the interrupts injected via the list registers. + * Consequently, most of the state that is modified by the guest (by ACK-ing + * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we + * keep a semi-consistent view of the interrupts. + * + * This still applies for a NV guest, but only while "InHost" (either + * running at EL2, or at EL0 with HCR_EL2.{E2H.TGE}=={1,1}. + * + * When running a L2 guest ("not InHost"), things are radically different, + * as the L1 guest is in charge of provisioning the interrupts via its own + * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR + * page. This means that the flow described above does work (there is no + * state to rebuild in the L0 hypervisor), and that most things happed on L2 + * load/put: + * + * - on L2 load: move the in-memory L1 vGIC configuration into a shadow, + * per-CPU data structure that is used to populate the actual LRs. This is + * an extra copy that we could avoid, but life is short. In the process, + * we remap any interrupt that has the HW bit set to the mapped interrupt + * on the host, should the host consider it a HW one. This allows the HW + * deactivation to take its course, such as for the timer. + * + * - on L2 put: perform the inverse transformation, so that the result of L2 + * running becomes visible to L1 in the VNCR-accessible registers. + * + * - there is nothing to do on L2 entry apart from enabling the vgic, as + * everything will have happened on load. However, this is the point where + * we detect that an interrupt targeting L1 and prepare the grand + * switcheroo. + * + * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate + * corresponding the L1 interrupt. The L0 active state will be cleared by + * the HW if the L1 interrupt was itself backed by a HW interrupt. + * + * Maintenance Interrupt (MI) management: + * + * Since the L2 guest runs the vgic in its full glory, MIs get delivered and + * used as a handover point between L2 and L1. + * + * - on delivery of a MI to L0 while L2 is running: make the L1 MI pending, + * and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to + * run and process the MI. + * + * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its + * state must be computed at each entry/exit of the guest, much like we do + * it for the PMU interrupt. + * + * - because most of the ICH_*_EL2 registers live in the VNCR page, the + * quality of emulation is poor: L1 can setup the vgic so that an MI would + * immediately fire, and not observe anything until the next exit. + * Similarly, a pending MI is not immediately disabled by clearing + * ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for + * example. + * + * System register emulation: + * + * We get two classes of registers: + * + * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access + * them, and L0 doesn't see a thing. + * + * - those that always trap (ELRSR, EISR, MISR): these are status registers + * that are built on the fly based on the in-memory state. + * + * Only L1 can access the ICH_*_EL2 registers. 
A non-NV L2 obviously cannot, + * and a NV L2 would either access the VNCR page provided by L1 (memory + * based registers), or see the access redirected to L1 (registers that + * trap) thanks to NV being set by L1. + */ + +bool vgic_state_is_nested(struct kvm_vcpu *vcpu) +{ + u64 xmo; + + if (is_nested_ctxt(vcpu)) { + xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO); + WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO), + "Separate virtual IRQ/FIQ settings not supported\n"); + + return !!xmo; + } + + return false; +} + +static struct shadow_if *get_shadow_if(void) +{ + return this_cpu_ptr(&shadow_if); +} + +static bool lr_triggers_eoi(u64 lr) +{ + return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI); +} + +static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state) +{ + u16 eisr = 0, elrsr = 0; + bool pend = false; + + for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + + if (lr_triggers_eoi(lr)) + eisr |= BIT(i); + if (!(lr & ICH_LR_STATE)) + elrsr |= BIT(i); + pend |= (lr & ICH_LR_PENDING_BIT); + } + + mi_state->eisr = eisr; + mi_state->elrsr = elrsr; + mi_state->pend = pend; +} + +u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + + vgic_compute_mi_state(vcpu, &mi_state); + return mi_state.eisr; +} + +u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + + vgic_compute_mi_state(vcpu, &mi_state); + return mi_state.elrsr; +} + +u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + u64 reg = 0, hcr, vmcr; + + hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); + + vgic_compute_mi_state(vcpu, &mi_state); + + if (mi_state.eisr) + reg |= ICH_MISR_EL2_EOI; + + if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) { + int used_lrs = kvm_vgic_global_state.nr_lr; + + used_lrs -= hweight16(mi_state.elrsr); + reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0; + } + + if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr)) + reg |= ICH_MISR_EL2_LRENP; + + if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend) + reg |= ICH_MISR_EL2_NP; + + if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK)) + reg |= ICH_MISR_EL2_VGrp0E; + + if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK)) + reg |= ICH_MISR_EL2_VGrp0D; + + if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK)) + reg |= ICH_MISR_EL2_VGrp1E; + + if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK)) + reg |= ICH_MISR_EL2_VGrp1D; + + return reg; +} + +static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr) +{ + struct vgic_irq *irq; + + if (!(lr & ICH_LR_HW)) + return lr; + + /* We have the HW bit set, check for validity of pINTID */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + /* If there was no real mapping, nuke the HW bit */ + if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI) + lr &= ~ICH_LR_HW; + + /* Translate the virtual mapping to the real one, even if invalid */ + if (irq) { + lr &= ~ICH_LR_PHYS_ID_MASK; + lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + vgic_put_irq(vcpu->kvm, irq); + } + + return lr; +} + +/* + * For LRs which have HW bit set such as timer interrupts, we modify them to + * have the host hardware interrupt number instead of the virtual one programmed + * by the guest hypervisor. 
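
A standalone sketch (not part of the patch) of the HW-bit rewrite just described, i.e. what translate_lr_pintid() above and vgic_v3_create_shadow_lr() below do to an L1-programmed list register. The ICH_LR field offsets follow the GICv3 layout and the intid-to-host mapping is entirely made up for the demo.

#include <stdint.h>
#include <stdio.h>

/* ICH_LR_EL2 fields used by the demo */
#define LR_HW                   (1ULL << 61)
#define LR_PHYS_ID_SHIFT        32
#define LR_PHYS_ID_MASK         (0x3ffULL << LR_PHYS_ID_SHIFT)

/* toy stand-in for the vgic lookup: which host INTID backs L1's interrupt? */
static int l1_intid_to_host_hwintid(unsigned int intid)
{
        return intid == 27 ? 26 : -1;   /* made-up mapping, only INTID 27 is HW */
}

static uint64_t make_shadow_lr(uint64_t lr)
{
        unsigned int l1_pintid = (unsigned int)((lr & LR_PHYS_ID_MASK) >> LR_PHYS_ID_SHIFT);
        int host = l1_intid_to_host_hwintid(l1_pintid);

        if (!(lr & LR_HW))
                return lr;
        if (host < 0)
                return lr & ~LR_HW;     /* no real backing: drop the HW bit */

        return (lr & ~LR_PHYS_ID_MASK) | ((uint64_t)host << LR_PHYS_ID_SHIFT);
}

int main(void)
{
        /* L1 queues its vINTID 30, backed (it believes) by "physical" INTID 27 */
        uint64_t l1_lr = LR_HW | (27ULL << LR_PHYS_ID_SHIFT) | 30;

        printf("shadow LR = %#llx\n", (unsigned long long)make_shadow_lr(l1_lr));
        return 0;
}
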
+ */ +static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, + struct vgic_v3_cpu_if *s_cpu_if) +{ + struct shadow_if *shadow_if; + + shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif); + shadow_if->lr_map = 0; + + for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + + if (!(lr & ICH_LR_STATE)) + continue; + + lr = translate_lr_pintid(vcpu, lr); + + s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr; + shadow_if->lr_map |= BIT(i); + } + + s_cpu_if->used_lrs = hweight16(shadow_if->lr_map); +} + +void vgic_v3_flush_nested(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2); +} + +void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + int i; + + for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { + u64 val, host_lr, lr; + + host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); + + /* Propagate the new LR state */ + lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + val = lr & ~ICH_LR_STATE; + val |= host_lr & ICH_LR_STATE; + __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); + + /* + * Deactivation of a HW interrupt: the LR must have the HW + * bit set, have been in a non-invalid state before the run, + * and now be in an invalid state. If any of that doesn't + * hold, we're done with this LR. + */ + if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) && + !(host_lr & ICH_LR_STATE))) + continue; + + /* + * If we had a HW lr programmed by the guest hypervisor, we + * need to emulate the HW effect between the guest hypervisor + * and the nested guest. + */ + vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + } + + /* We need these to be synchronised to generate the MI */ + __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2)); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount); + + write_sysreg_s(0, SYS_ICH_HCR_EL2); + isb(); + + vgic_v3_nested_update_mi(vcpu); +} + +static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, + struct vgic_v3_cpu_if *s_cpu_if) +{ + struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3; + int i; + + s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); + s_cpu_if->vgic_sre = host_if->vgic_sre; + + for (i = 0; i < 4; i++) { + s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i)); + s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i)); + } + + vgic_v3_create_shadow_lr(vcpu, s_cpu_if); +} + +void vgic_v3_load_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif; + + BUG_ON(!vgic_state_is_nested(vcpu)); + + vgic_v3_create_shadow_state(vcpu, cpu_if); + + __vgic_v3_restore_vmcr_aprs(cpu_if); + __vgic_v3_activate_traps(cpu_if); + + for (int i = 0; i < cpu_if->used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); + + /* + * Propagate the number of used LRs for the benefit of the HYP + * GICv3 emulation code. Yes, this is a pretty sorry hack. 
+ */ + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs; +} + +void vgic_v3_put_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif; + int i; + + __vgic_v3_save_aprs(s_cpu_if); + + for (i = 0; i < 4; i++) { + __vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]); + __vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]); + } + + for (i = 0; i < s_cpu_if->used_lrs; i++) + __gic_v3_set_lr(0, i); + + __vgic_v3_deactivate_traps(s_cpu_if); + + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; +} + +/* + * If we exit a L2 VM with a pending maintenance interrupt from the GIC, + * then we need to forward this to L1 so that it can re-sync the appropriate + * LRs and sample level triggered interrupts again. + */ +void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu) +{ + bool state = read_sysreg_s(SYS_ICH_MISR_EL2); + + /* This will force a switch back to L1 if the level is high */ + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + vcpu->kvm->arch.vgic.mi_intid, state, vcpu); + + sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0); +} + +void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu) +{ + bool level; + + level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu); + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + vcpu->kvm->arch.vgic.mi_intid, level, vcpu); +} diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c new file mode 100644 index 000000000000..1d6dd1b545bd --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -0,0 +1,1009 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/kstrtox.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/string_choices.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_asm.h> + +#include "vgic-mmio.h" +#include "vgic.h" + +static bool group0_trap; +static bool group1_trap; +static bool common_trap; +static bool dir_trap; +static bool gicv4_enable; + +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + cpuif->vgic_hcr = ICH_HCR_EL2_En; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; + + if (!als->nr_sgi) + cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG0_MASK) ? + ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ? + ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; + + /* + * Dealing with EOImode=1 is a massive source of headache. Not + * only do we need to track that we have active interrupts + * outside of the LRs and force DIR to be trapped, we also + * need to deal with SPIs that can be deactivated on another + * CPU. + * + * On systems that do not implement TDIR, force the bit in the + * shadow state anyway to avoid IPI-ing on these poor sods. + * + * Note that we set the trap irrespective of EOIMode, as that + * can change behind our back without any warning... 
+ */ + if (!cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) || + irqs_active_outside_lrs(als) || + atomic_read(&vcpu->kvm->arch.vgic.active_spis)) + cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR; +} + +static bool lr_signals_eoi_mi(u64 lr_val) +{ + return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && + !(lr_val & ICH_LR_HW); +} + +static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) +{ + struct vgic_irq *irq; + bool is_v2_sgi = false; + bool deactivated; + u32 intid; + + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + intid = val & ICH_LR_VIRTUAL_ID_MASK; + } else { + intid = val & GICH_LR_VIRTUALID; + is_v2_sgi = vgic_irq_is_sgi(intid); + } + + irq = vgic_get_vcpu_irq(vcpu, intid); + if (!irq) /* An LPI could have been unmapped. */ + return; + + scoped_guard(raw_spinlock, &irq->irq_lock) { + /* Always preserve the active bit for !LPIs, note deactivation */ + if (irq->intid >= VGIC_MIN_LPI) + val &= ~ICH_LR_ACTIVE_BIT; + deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); + irq->active = !!(val & ICH_LR_ACTIVE_BIT); + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & ICH_LR_PENDING_BIT)) + irq->pending_latch = true; + + /* + * Clear soft pending state when level irqs have been acked. + */ + if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) + irq->pending_latch = false; + + if (is_v2_sgi) { + u8 cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val); + + if (irq->active) + irq->active_source = cpuid; + + if (val & ICH_LR_PENDING_BIT) + irq->source |= BIT(cpuid); + } + + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); + + irq->on_lr = false; + } + + /* Notify fds when the guest EOI'ed a level-triggered SPI, and drop the refcount */ + if (deactivated && lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) { + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + atomic_dec_if_positive(&vcpu->kvm->arch.vgic.active_spis); + } + + vgic_put_irq(vcpu->kvm, irq); +} + +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + +static void vgic_v3_deactivate_phys(u32 intid) +{ + if (cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) + gic_insn(intid | FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, 1), CDDI); + else + gic_write_dir(intid); +} + +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr); + struct vgic_irq *irq; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + for (int lr = 0; lr < cpuif->used_lrs; lr++) + vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]); + + /* + * EOIMode=0: use EOIcount to emulate deactivation. We are + * guaranteed to deactivate in reverse order of the activation, so + * just pick one active interrupt after the other in the ap_list, + * and replay the deactivation as if the CPU was doing it. We also + * rely on priority drop to have taken place, and the list to be + * sorted by priority. + */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u64 lr; + + /* + * I would have loved to write this using a scoped_guard(), + * but using 'continue' here is a total train wreck. 
+ */ + if (!eoicount) { + break; + } else { + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; + } + + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + + vgic_v3_fold_lr(vcpu, lr); + eoicount--; + } + + cpuif->used_lrs = 0; +} + +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false, is_v2_sgi; + struct vgic_irq *irq; + unsigned long flags; + u64 lr = 0; + u8 cpuid; + + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); + + is_v2_sgi = (model == KVM_DEV_TYPE_ARM_VGIC_V2 && + val < VGIC_NR_SGIS); + + /* + * We only deal with DIR when EOIMode==1, and only for SGI, + * PPI or SPI. + */ + if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) || + val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* + * EOIMode=1: we must rely on traps to handle deactivate of + * overflowing interrupts, as there is no ordering guarantee and + * EOIcount isn't being incremented. Priority drop will have taken + * place, as ICV_EOIxR_EL1 only affects the APRs and not the LRs. + * + * Three possibities: + * + * - The irq is not queued on any CPU, and there is nothing to + * do, + * + * - Or the irq is in an LR, meaning that its state is not + * directly observable. Treat it bluntly by making it as if + * this was a write to GICD_ICACTIVER, which will force an + * exit on all vcpus. If it hurts, don't do that. + * + * - Or the irq is active, but not in an LR, and we can + * directly deactivate it by building a pseudo-LR, fold it, + * and queue a request to prune the resulting ap_list, + * + * Special care must be taken to match the source CPUID when + * deactivating a GICv2 SGI. + */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; + + /* + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. + */ + if (irq->on_lr) { + mmio = true; + goto put; + } + + /* GICv2 SGI: check that the cpuid matches */ + if (is_v2_sgi && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } + + /* (with a Dalek voice) DEACTIVATE!!!! 
*/ + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; + } + + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + + vgic_v3_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); +} + +/* Requires the irq to be locked already */ +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = irq->intid; + bool allow_pending = true, is_v2_sgi; + + WARN_ON(irq->on_lr); + + is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2); + + if (irq->active) { + val |= ICH_LR_ACTIVE_BIT; + if (is_v2_sgi) + val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; + if (vgic_irq_is_multi_sgi(irq)) { + allow_pending = false; + val |= ICH_LR_EOI; + } + } + + if (irq->hw && !vgic_irq_needs_resampling(irq)) { + val |= ICH_LR_HW; + val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; + /* + * Never set pending+active on a HW interrupt, as the + * pending state is kept at the physical distributor + * level. + */ + if (irq->active) + allow_pending = false; + } else { + if (irq->config == VGIC_CONFIG_LEVEL) { + val |= ICH_LR_EOI; + + /* + * Software resampling doesn't work very well + * if we allow P+A, so let's not do that. + */ + if (irq->active) + allow_pending = false; + } + } + + if (allow_pending && irq_is_pending(irq)) { + val |= ICH_LR_PENDING_BIT; + + if (is_v2_sgi) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return 0; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + if (irq->source & ~BIT(src - 1)) + val |= ICH_LR_EOI; + } + } + + if (irq->group) + val |= ICH_LR_GROUP; + + val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; + + return val; +} + +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = vgic_v3_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + + if (val & ICH_LR_PENDING_BIT) { + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending_latch = false; + + if (vgic_irq_is_sgi(irq->intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2) { + u32 src = ffs(irq->source); + + irq->source &= ~BIT(src - 1); + if (irq->source) + irq->pending_latch = true; + } + } + + /* + * Level-triggered mapped IRQs are special because we only observe + * rising edges as input to the VGIC. We therefore lower the line + * level here, so that we can take new virtual IRQs. See + * vgic_v3_fold_lr_state for more info. + */ + if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) + irq->line_level = false; + + irq->on_lr = true; +} + +void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; +} + +void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u32 vmcr; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { + vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) & + ICH_VMCR_ACK_CTL_MASK; + vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) & + ICH_VMCR_FIQ_EN_MASK; + } else { + /* + * When emulating GICv3 on GICv3 with SRE=1 on the + * VFIQEn bit is RES1 and the VAckCtl bit is RES0. 
+ */ + vmcr = ICH_VMCR_FIQ_EN_MASK; + } + + vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; + vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; + vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; + vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; + vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; + vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; + vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; + + cpu_if->vgic_vmcr = vmcr; +} + +void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { + vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >> + ICH_VMCR_ACK_CTL_SHIFT; + vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >> + ICH_VMCR_FIQ_EN_SHIFT; + } else { + /* + * When emulating GICv3 on GICv3 with SRE=1 on the + * VFIQEn bit is RES1 and the VAckCtl bit is RES0. + */ + vmcrp->fiqen = 1; + vmcrp->ackctl = 0; + } + + vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; + vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT; + vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; + vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; + vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; + vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; +} + +#define INITIAL_PENDBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) + +void vgic_v3_reset(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vgic_v3->vgic_vmcr = 0; + + /* + * If we are emulating a GICv3, we do it in an non-GICv2-compatible + * way, so we force SRE to 1 to demonstrate this to the guest. + * Also, we don't support any form of IRQ/FIQ bypass. + * This goes with the spec allowing the value to be RAO/WI. 
+ */ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB | + ICC_SRE_EL1_DFB | + ICC_SRE_EL1_SRE); + vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; + } else { + vgic_v3->vgic_sre = 0; + } + + vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits, + kvm_vgic_global_state.ich_vtr_el2); + vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, + kvm_vgic_global_state.ich_vtr_el2) + 1; +} + +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (!vgic_is_v3(vcpu->kvm)) + return; + + /* Hide GICv3 sysreg if necessary */ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 || + !irqchip_in_kernel(vcpu->kvm)) + vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | + ICH_HCR_EL2_TC); +} + +int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) +{ + struct kvm_vcpu *vcpu; + int byte_offset, bit_nr; + gpa_t pendbase, ptr; + bool status; + u8 val; + int ret; + unsigned long flags; + +retry: + vcpu = irq->target_vcpu; + if (!vcpu) + return 0; + + pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + + byte_offset = irq->intid / BITS_PER_BYTE; + bit_nr = irq->intid % BITS_PER_BYTE; + ptr = pendbase + byte_offset; + + ret = kvm_read_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + + status = val & (1 << bit_nr); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->target_vcpu != vcpu) { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + goto retry; + } + irq->pending_latch = status; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + if (status) { + /* clear consumed data */ + val &= ~(1 << bit_nr); + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + } + return 0; +} + +/* + * The deactivation of the doorbell interrupt will trigger the + * unmapping of the associated vPE. + */ +static void unmap_all_vpes(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int i; + + for (i = 0; i < dist->its_vm.nr_vpes; i++) + free_irq(dist->its_vm.vpes[i]->irq, kvm_get_vcpu(kvm, i)); +} + +static void map_all_vpes(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int i; + + for (i = 0; i < dist->its_vm.nr_vpes; i++) + WARN_ON(vgic_v4_request_vpe_irq(kvm_get_vcpu(kvm, i), + dist->its_vm.vpes[i]->irq)); +} + +/* + * vgic_v3_save_pending_tables - Save the pending tables into guest RAM + * kvm lock and all vcpu lock must be held + */ +int vgic_v3_save_pending_tables(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + gpa_t last_ptr = ~(gpa_t)0; + bool vlpi_avail = false; + unsigned long index; + int ret = 0; + u8 val; + + if (unlikely(!vgic_initialized(kvm))) + return -ENXIO; + + /* + * A preparation for getting any VLPI states. + * The above vgic initialized check also ensures that the allocation + * and enabling of the doorbells have already been done. 
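+	 *
+	 * Unmapping the vPEs below is what invalidates any VPT caches,
+	 * which in turn is the precondition for vgic_v4_get_vlpi_state()
+	 * peeking at the VPT directly.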
+ */ + if (kvm_vgic_global_state.has_gicv4_1) { + unmap_all_vpes(kvm); + vlpi_avail = true; + } + + xa_for_each(&dist->lpi_xa, index, irq) { + int byte_offset, bit_nr; + struct kvm_vcpu *vcpu; + gpa_t pendbase, ptr; + bool is_pending; + bool stored; + + vcpu = irq->target_vcpu; + if (!vcpu) + continue; + + pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + + byte_offset = irq->intid / BITS_PER_BYTE; + bit_nr = irq->intid % BITS_PER_BYTE; + ptr = pendbase + byte_offset; + + if (ptr != last_ptr) { + ret = kvm_read_guest_lock(kvm, ptr, &val, 1); + if (ret) + goto out; + last_ptr = ptr; + } + + stored = val & (1U << bit_nr); + + is_pending = irq->pending_latch; + + if (irq->hw && vlpi_avail) + vgic_v4_get_vlpi_state(irq, &is_pending); + + if (stored == is_pending) + continue; + + if (is_pending) + val |= 1 << bit_nr; + else + val &= ~(1 << bit_nr); + + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); + if (ret) + goto out; + } + +out: + if (vlpi_avail) + map_all_vpes(kvm); + + return ret; +} + +/** + * vgic_v3_rdist_overlap - check if a region overlaps with any + * existing redistributor region + * + * @kvm: kvm handle + * @base: base of the region + * @size: size of region + * + * Return: true if there is an overlap + */ +bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size) +{ + struct vgic_dist *d = &kvm->arch.vgic; + struct vgic_redist_region *rdreg; + + list_for_each_entry(rdreg, &d->rd_regions, list) { + if ((base + size > rdreg->base) && + (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg))) + return true; + } + return false; +} + +/* + * Check for overlapping regions and for regions crossing the end of memory + * for base addresses which have already been set. + */ +bool vgic_v3_check_base(struct kvm *kvm) +{ + struct vgic_dist *d = &kvm->arch.vgic; + struct vgic_redist_region *rdreg; + + if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && + d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) + return false; + + list_for_each_entry(rdreg, &d->rd_regions, list) { + size_t sz = vgic_v3_rd_region_size(kvm, rdreg); + + if (vgic_check_iorange(kvm, VGIC_ADDR_UNDEF, + rdreg->base, SZ_64K, sz)) + return false; + } + + if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base)) + return true; + + return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base, + KVM_VGIC_V3_DIST_SIZE); +} + +/** + * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one + * which has free space to put a new rdist region. + * + * @rd_regions: redistributor region list head + * + * A redistributor regions maps n redistributors, n = region size / (2 x 64kB). + * Stride between redistributors is 0 and regions are filled in the index order. + * + * Return: the redist region handle, if any, that has space to map a new rdist + * region. 
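+ *
+ * For example, a 256kB region maps 256kB / 128kB = 2 redistributors,
+ * while a single-redistributor region is exactly 128kB.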
+ */ +struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions) +{ + struct vgic_redist_region *rdreg; + + list_for_each_entry(rdreg, rd_regions, list) { + if (!vgic_v3_redist_region_full(rdreg)) + return rdreg; + } + return NULL; +} + +struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, + u32 index) +{ + struct list_head *rd_regions = &kvm->arch.vgic.rd_regions; + struct vgic_redist_region *rdreg; + + list_for_each_entry(rdreg, rd_regions, list) { + if (rdreg->index == index) + return rdreg; + } + return NULL; +} + + +int vgic_v3_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + unsigned long c; + + kvm_for_each_vcpu(c, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { + kvm_debug("vcpu %ld redistributor base not set\n", c); + return -ENXIO; + } + } + + if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { + kvm_debug("Need to set vgic distributor addresses first\n"); + return -ENXIO; + } + + if (!vgic_v3_check_base(kvm)) { + kvm_debug("VGIC redist and dist frames overlap\n"); + return -EINVAL; + } + + /* + * For a VGICv3 we require the userland to explicitly initialize + * the VGIC before we need to use it. + */ + if (!vgic_initialized(kvm)) { + return -EBUSY; + } + + if (kvm_vgic_global_state.has_gicv4_1) + vgic_v4_configure_vsgis(kvm); + + return 0; +} + +DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); +DEFINE_STATIC_KEY_FALSE(vgic_v3_has_v2_compat); + +static int __init early_group0_trap_cfg(char *buf) +{ + return kstrtobool(buf, &group0_trap); +} +early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); + +static int __init early_group1_trap_cfg(char *buf) +{ + return kstrtobool(buf, &group1_trap); +} +early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); + +static int __init early_common_trap_cfg(char *buf) +{ + return kstrtobool(buf, &common_trap); +} +early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); + +static int __init early_gicv4_enable(char *buf) +{ + return kstrtobool(buf, &gicv4_enable); +} +early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); + +static const struct midr_range broken_seis[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, +}; + +static bool vgic_v3_broken_seis(void) +{ + return (is_kernel_in_hyp_mode() && + is_midr_in_range_list(broken_seis) && + (read_sysreg_s(SYS_ICH_VTR_EL2) & ICH_VTR_EL2_SEIS)); +} + +void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, + int nr_inst) +{ + u32 insn, oinsn, rd; + u64 hcr = 0; + + if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_30115)) { + group0_trap = true; + group1_trap = true; + } + + if (vgic_v3_broken_seis()) { + /* We know that these machines have ICH_HCR_EL2.TDIR */ + group0_trap = true; + group1_trap = true; + dir_trap = true; + } + + if (!cpus_have_cap(ARM64_HAS_ICH_HCR_EL2_TDIR)) + common_trap = true; + + if (group0_trap) + hcr |= 
ICH_HCR_EL2_TALL0; + if (group1_trap) + hcr |= ICH_HCR_EL2_TALL1; + if (common_trap) + hcr |= ICH_HCR_EL2_TC; + if (dir_trap) + hcr |= ICH_HCR_EL2_TDIR; + + /* Compute target register */ + oinsn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn); + + /* movz rd, #(val & 0xffff) */ + insn = aarch64_insn_gen_movewide(rd, + (u16)hcr, + 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr = cpu_to_le32(insn); +} + +/** + * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller + * @info: pointer to the GIC description + * + * Returns 0 if the VGICv3 has been probed successfully, returns an error code + * otherwise + */ +int vgic_v3_probe(const struct gic_kvm_info *info) +{ + u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); + bool has_v2; + u64 traps; + int ret; + + has_v2 = ich_vtr_el2 >> 63; + ich_vtr_el2 = (u32)ich_vtr_el2; + + /* + * The ListRegs field is 5 bits, but there is an architectural + * maximum of 16 list registers. Just ignore bit 4... + */ + kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; + kvm_vgic_global_state.can_emulate_gicv2 = false; + kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; + + /* GICv4 support? */ + if (info->has_v4) { + kvm_vgic_global_state.has_gicv4 = gicv4_enable; + kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; + kvm_info("GICv4%s support %s\n", + kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", + str_enabled_disabled(gicv4_enable)); + } + + kvm_vgic_global_state.vcpu_base = 0; + + if (!info->vcpu.start) { + kvm_info("GICv3: no GICV resource entry\n"); + } else if (!has_v2) { + pr_warn(FW_BUG "CPU interface incapable of MMIO access\n"); + } else if (!PAGE_ALIGNED(info->vcpu.start)) { + pr_warn("GICV physical address 0x%llx not page aligned\n", + (unsigned long long)info->vcpu.start); + } else if (kvm_get_mode() != KVM_MODE_PROTECTED) { + kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.can_emulate_gicv2 = true; + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device.\n"); + return ret; + } + kvm_info("vgic-v2@%llx\n", info->vcpu.start); + } + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); + if (ret) { + kvm_err("Cannot register GICv3 KVM device.\n"); + kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); + return ret; + } + + if (kvm_vgic_global_state.vcpu_base == 0) + kvm_info("disabling GICv2 emulation\n"); + + /* + * Flip the static branch if the HW supports v2, even if we're + * not using it (such as in protected mode). + */ + if (has_v2) + static_branch_enable(&vgic_v3_has_v2_compat); + + if (vgic_v3_broken_seis()) { + kvm_info("GICv3 with broken locally generated SEI\n"); + kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; + } + + traps = vgic_ich_hcr_trap_bits(); + if (traps) { + kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", + (traps & ICH_HCR_EL2_TALL0) ? "G0" : "", + (traps & ICH_HCR_EL2_TALL1) ? "G1" : "", + (traps & ICH_HCR_EL2_TC) ? "C" : "", + (traps & ICH_HCR_EL2_TDIR) ? 
"D" : ""); + static_branch_enable(&vgic_v3_cpuif_trap); + } + + kvm_vgic_global_state.vctrl_base = NULL; + kvm_vgic_global_state.type = VGIC_V3; + kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; + + return 0; +} + +void vgic_v3_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + /* If the vgic is nested, perform the full state loading */ + if (vgic_state_is_nested(vcpu)) { + vgic_v3_load_nested(vcpu); + return; + } + + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); + + if (has_vhe()) + __vgic_v3_activate_traps(cpu_if); + + WARN_ON(vgic_v4_load(vcpu)); +} + +void vgic_v3_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + if (vgic_state_is_nested(vcpu)) { + vgic_v3_put_nested(vcpu); + return; + } + + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); + WARN_ON(vgic_v4_put(vcpu)); + + if (has_vhe()) + __vgic_v3_deactivate_traps(cpu_if); +} diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c new file mode 100644 index 000000000000..09c3e9eb23f8 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 ARM Ltd. + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/kvm_host.h> +#include <linux/irqchip/arm-gic-v3.h> + +#include "vgic.h" + +/* + * How KVM uses GICv4 (insert rude comments here): + * + * The vgic-v4 layer acts as a bridge between several entities: + * - The GICv4 ITS representation offered by the ITS driver + * - VFIO, which is in charge of the PCI endpoint + * - The virtual ITS, which is the only thing the guest sees + * + * The configuration of VLPIs is triggered by a callback from VFIO, + * instructing KVM that a PCI device has been configured to deliver + * MSIs to a vITS. + * + * kvm_vgic_v4_set_forwarding() is thus called with the routing entry, + * and this is used to find the corresponding vITS data structures + * (ITS instance, device, event and irq) using a process that is + * extremely similar to the injection of an MSI. + * + * At this stage, we can link the guest's view of an LPI (uniquely + * identified by the routing entry) and the host irq, using the GICv4 + * driver mapping operation. Should the mapping succeed, we've then + * successfully upgraded the guest's LPI to a VLPI. We can then start + * with updating GICv4's view of the property table and generating an + * INValidation in order to kickstart the delivery of this VLPI to the + * guest directly, without software intervention. Well, almost. + * + * When the PCI endpoint is deconfigured, this operation is reversed + * with VFIO calling kvm_vgic_v4_unset_forwarding(). + * + * Once the VLPI has been mapped, it needs to follow any change the + * guest performs on its LPI through the vITS. For that, a number of + * command handlers have hooks to communicate these changes to the HW: + * - Any invalidation triggers a call to its_prop_update_vlpi() + * - The INT command results in a irq_set_irqchip_state(), which + * generates an INT on the corresponding VLPI. + * - The CLEAR command results in a irq_set_irqchip_state(), which + * generates an CLEAR on the corresponding VLPI. + * - DISCARD translates into an unmap, similar to a call to + * kvm_vgic_v4_unset_forwarding(). 
+ * - MOVI is translated by an update of the existing mapping, changing + * the target vcpu, resulting in a VMOVI being generated. + * - MOVALL is translated by a string of mapping updates (similar to + * the handling of MOVI). MOVALL is horrible. + * + * Note that a DISCARD/MAPTI sequence emitted from the guest without + * reprogramming the PCI endpoint after MAPTI does not result in a + * VLPI being mapped, as there is no callback from VFIO (the guest + * will get the interrupt via the normal SW injection). Fixing this is + * not trivial, and requires some horrible messing with the VFIO + * internals. Not fun. Don't do that. + * + * Then there is the scheduling. Each time a vcpu is about to run on a + * physical CPU, KVM must tell the corresponding redistributor about + * it. And if we've migrated our vcpu from one CPU to another, we must + * tell the ITS (so that the messages reach the right redistributor). + * This is done in two steps: first issue a irq_set_affinity() on the + * irq corresponding to the vcpu, then call its_make_vpe_resident(). + * You must be in a non-preemptible context. On exit, a call to + * its_make_vpe_non_resident() tells the redistributor that we're done + * with the vcpu. + * + * Finally, the doorbell handling: Each vcpu is allocated an interrupt + * which will fire each time a VLPI is made pending whilst the vcpu is + * not running. Each time the vcpu gets blocked, the doorbell + * interrupt gets enabled. When the vcpu is unblocked (for whatever + * reason), the doorbell interrupt is disabled. + */ + +#define DB_IRQ_FLAGS (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING) + +static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info) +{ + struct kvm_vcpu *vcpu = info; + + /* We got the message, no need to fire again */ + if (!kvm_vgic_global_state.has_gicv4_1 && + !irqd_irq_disabled(&irq_to_desc(irq)->irq_data)) + disable_irq_nosync(irq); + + /* + * The v4.1 doorbell can fire concurrently with the vPE being + * made non-resident. Ensure we only update pending_last + * *after* the non-residency sequence has completed. + */ + raw_spin_lock(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vpe_lock); + vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true; + raw_spin_unlock(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vpe_lock); + + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + + return IRQ_HANDLED; +} + +static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq) +{ + vpe->sgi_config[irq->intid].enabled = irq->enabled; + vpe->sgi_config[irq->intid].group = irq->group; + vpe->sgi_config[irq->intid].priority = irq->priority; +} + +static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + int i; + + /* + * With GICv4.1, every virtual SGI can be directly injected. So + * let's pretend that they are HW interrupts, tied to a host + * IRQ. The SGI code will do its magic. 
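+	 *
+	 * Concretely, the loop below ties each vSGI to its host IRQ,
+	 * copies the current enable/group/priority configuration over via
+	 * vgic_v4_sync_sgi_config(), and finally transfers any pending
+	 * state to the HW.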
+ */ + for (i = 0; i < VGIC_NR_SGIS; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); + struct irq_desc *desc; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw) + goto unlock; + + irq->hw = true; + irq->host_irq = irq_find_mapping(vpe->sgi_domain, i); + + /* Transfer the full irq state to the vPE */ + vgic_v4_sync_sgi_config(vpe, irq); + desc = irq_to_desc(irq->host_irq); + ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc), + false); + if (!WARN_ON(ret)) { + /* Transfer pending state */ + ret = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + irq->pending_latch); + WARN_ON(ret); + irq->pending_latch = false; + } + unlock: + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < VGIC_NR_SGIS; i++) { + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); + struct irq_desc *desc; + unsigned long flags; + bool pending; + int ret; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (!irq->hw) + goto unlock; + + irq->hw = false; + ret = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &pending); + WARN_ON(ret); + + irq->pending_latch = pending; + + desc = irq_to_desc(irq->host_irq); + irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); + unlock: + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_v4_configure_vsgis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->arch.config_lock); + + kvm_arm_halt_guest(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (dist->nassgireq) + vgic_v4_enable_vsgis(vcpu); + else + vgic_v4_disable_vsgis(vcpu); + } + + kvm_arm_resume_guest(kvm); +} + +/* + * Must be called with GICv4.1 and the vPE unmapped, which + * indicates the invalidation of any VPT caches associated + * with the vPE, thus we can get the VLPI state by peeking + * at the VPT. + */ +void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val) +{ + struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + int mask = BIT(irq->intid % BITS_PER_BYTE); + void *va; + u8 *ptr; + + va = page_address(vpe->vpt_page); + ptr = va + irq->intid / BITS_PER_BYTE; + + *val = !!(*ptr & mask); +} + +int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq) +{ + return request_irq(irq, vgic_v4_doorbell_handler, 0, "vcpu", vcpu); +} + +/** + * vgic_v4_init - Initialize the GICv4 data structures + * @kvm: Pointer to the VM being initialized + * + * We may be called each time a vITS is created, or when the + * vgic is initialized. In both cases, the number of vcpus + * should now be fixed. + */ +int vgic_v4_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int nr_vcpus, ret; + unsigned long i; + + lockdep_assert_held(&kvm->arch.config_lock); + + if (!kvm_vgic_global_state.has_gicv4) + return 0; /* Nothing to see here... move along. 
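+		  * (GICv4 is either not present or was left disabled,
+		  * e.g. via the kvm-arm.vgic_v4_enable command line
+		  * parameter.)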
*/ + + if (dist->its_vm.vpes) + return 0; + + nr_vcpus = atomic_read(&kvm->online_vcpus); + + dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes), + GFP_KERNEL_ACCOUNT); + if (!dist->its_vm.vpes) + return -ENOMEM; + + dist->its_vm.nr_vpes = nr_vcpus; + + kvm_for_each_vcpu(i, vcpu, kvm) + dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + ret = its_alloc_vcpu_irqs(&dist->its_vm); + if (ret < 0) { + kvm_err("VPE IRQ allocation failure\n"); + kfree(dist->its_vm.vpes); + dist->its_vm.nr_vpes = 0; + dist->its_vm.vpes = NULL; + return ret; + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + int irq = dist->its_vm.vpes[i]->irq; + unsigned long irq_flags = DB_IRQ_FLAGS; + + /* + * Don't automatically enable the doorbell, as we're + * flipping it back and forth when the vcpu gets + * blocked. Also disable the lazy disabling, as the + * doorbell could kick us out of the guest too + * early... + * + * On GICv4.1, the doorbell is managed in HW and must + * be left enabled. + */ + if (kvm_vgic_global_state.has_gicv4_1) + irq_flags &= ~IRQ_NOAUTOEN; + irq_set_status_flags(irq, irq_flags); + + ret = vgic_v4_request_vpe_irq(vcpu, irq); + if (ret) { + kvm_err("failed to allocate vcpu IRQ%d\n", irq); + /* + * Trick: adjust the number of vpes so we know + * how many to nuke on teardown... + */ + dist->its_vm.nr_vpes = i; + break; + } + } + + if (ret) + vgic_v4_teardown(kvm); + + return ret; +} + +/** + * vgic_v4_teardown - Free the GICv4 data structures + * @kvm: Pointer to the VM being destroyed + */ +void vgic_v4_teardown(struct kvm *kvm) +{ + struct its_vm *its_vm = &kvm->arch.vgic.its_vm; + int i; + + lockdep_assert_held(&kvm->arch.config_lock); + + if (!its_vm->vpes) + return; + + for (i = 0; i < its_vm->nr_vpes; i++) { + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i); + int irq = its_vm->vpes[i]->irq; + + irq_clear_status_flags(irq, DB_IRQ_FLAGS); + free_irq(irq, vcpu); + } + + its_free_vcpu_irqs(its_vm); + kfree(its_vm->vpes); + its_vm->nr_vpes = 0; + its_vm->vpes = NULL; +} + +static inline bool vgic_v4_want_doorbell(struct kvm_vcpu *vcpu) +{ + if (vcpu_get_flag(vcpu, IN_WFI)) + return true; + + if (likely(!vcpu_has_nv(vcpu))) + return false; + + /* + * GICv4 hardware is only ever used for the L1. Mark the vPE (i.e. the + * L1 context) nonresident and request a doorbell to kick us out of the + * L2 when an IRQ becomes pending. + */ + return vcpu_get_flag(vcpu, IN_NESTED_ERET); +} + +int vgic_v4_put(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + if (!vgic_supports_direct_irqs(vcpu->kvm) || !vpe->resident) + return 0; + + return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu)); +} + +int vgic_v4_load(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + int err; + + if (!vgic_supports_direct_irqs(vcpu->kvm) || vpe->resident) + return 0; + + if (vcpu_get_flag(vcpu, IN_WFI)) + return 0; + + /* + * Before making the VPE resident, make sure the redistributor + * corresponding to our current CPU expects us here. See the + * doc in drivers/irqchip/irq-gic-v4.c to understand how this + * turns into a VMOVP command at the ITS level. + */ + err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id())); + if (err) + return err; + + err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled); + if (err) + return err; + + /* + * Now that the VPE is resident, let's get rid of a potential + * doorbell interrupt that would still be pending. This is a + * GICv4.0 only "feature"... 
+ */ + if (!kvm_vgic_global_state.has_gicv4_1) + err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false); + + return err; +} + +void vgic_v4_commit(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + /* + * No need to wait for the vPE to be ready across a shallow guest + * exit, as only a vcpu_put will invalidate it. + */ + if (!vpe->ready) + its_commit_vpe(vpe); +} + +static struct vgic_its *vgic_get_its(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct kvm_msi msi = (struct kvm_msi) { + .address_lo = irq_entry->msi.address_lo, + .address_hi = irq_entry->msi.address_hi, + .data = irq_entry->msi.data, + .flags = irq_entry->msi.flags, + .devid = irq_entry->msi.devid, + }; + + return vgic_msi_to_its(kvm, &msi); +} + +int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct vgic_its *its; + struct vgic_irq *irq; + struct its_vlpi_map map; + unsigned long flags; + int ret = 0; + + if (!vgic_supports_direct_msis(kvm)) + return 0; + + /* + * Get the ITS, and escape early on error (not a valid + * doorbell for any of our vITSs). + */ + its = vgic_get_its(kvm, irq_entry); + if (IS_ERR(its)) + return 0; + + guard(mutex)(&its->its_lock); + + /* + * Perform the actual DevID/EventID -> LPI translation. + * + * Silently exit if translation fails as the guest (or userspace!) has + * managed to do something stupid. Emulated LPI injection will still + * work if the guest figures itself out at a later time. + */ + if (vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq)) + return 0; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* Silently exit if the vLPI is already mapped */ + if (irq->hw) + goto out_unlock_irq; + + /* + * Emit the mapping request. If it fails, the ITS probably + * isn't v4 compatible, so let's silently bail out. Holding + * the ITS lock should ensure that nothing can modify the + * target vcpu. + */ + map = (struct its_vlpi_map) { + .vm = &kvm->arch.vgic.its_vm, + .vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe, + .vintid = irq->intid, + .properties = ((irq->priority & 0xfc) | + (irq->enabled ? LPI_PROP_ENABLED : 0) | + LPI_PROP_GROUP1), + .db_enabled = true, + }; + + ret = its_map_vlpi(virq, &map); + if (ret) + goto out_unlock_irq; + + irq->hw = true; + irq->host_irq = virq; + atomic_inc(&map.vpe->vlpi_count); + + /* Transfer pending state */ + if (!irq->pending_latch) + goto out_unlock_irq; + + ret = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, + irq->pending_latch); + WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq); + + /* + * Clear pending_latch and communicate this state + * change via vgic_queue_irq_unlock. 
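+	 *
+	 * Note that vgic_queue_irq_unlock() also drops the irq_lock taken
+	 * above, which is why this path has no explicit unlock.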
+ */ + irq->pending_latch = false; + vgic_queue_irq_unlock(kvm, irq, flags); + return ret; + +out_unlock_irq: + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + return ret; +} + +static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq) +{ + struct vgic_irq *irq; + unsigned long idx; + + guard(rcu)(); + xa_for_each(&kvm->arch.vgic.lpi_xa, idx, irq) { + if (!irq->hw || irq->host_irq != host_irq) + continue; + + if (!vgic_try_get_irq_ref(irq)) + return NULL; + + return irq; + } + + return NULL; +} + +void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) +{ + struct vgic_irq *irq; + unsigned long flags; + + if (!vgic_supports_direct_msis(kvm)) + return; + + irq = __vgic_host_irq_get_vlpi(kvm, host_irq); + if (!irq) + return; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + WARN_ON(irq->hw && irq->host_irq != host_irq); + if (irq->hw) { + atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); + irq->hw = false; + its_unmap_vlpi(host_irq); + } + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(kvm, irq); +} diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c new file mode 100644 index 000000000000..2d3811f4e117 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <kvm/arm_vgic.h> +#include <linux/irqchip/arm-vgic-info.h> + +#include "vgic.h" + +/* + * Probe for a vGICv5 compatible interrupt controller, returning 0 on success. + * Currently only supports GICv3-based VMs on a GICv5 host, and hence only + * registers a VGIC_V3 device. + */ +int vgic_v5_probe(const struct gic_kvm_info *info) +{ + u64 ich_vtr_el2; + int ret; + + if (!cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) + return -ENODEV; + + kvm_vgic_global_state.type = VGIC_V5; + kvm_vgic_global_state.has_gcie_v3_compat = true; + + /* We only support v3 compat mode - use vGICv3 limits */ + kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; + + kvm_vgic_global_state.vcpu_base = 0; + kvm_vgic_global_state.vctrl_base = NULL; + kvm_vgic_global_state.can_emulate_gicv2 = false; + kvm_vgic_global_state.has_gicv4 = false; + kvm_vgic_global_state.has_gicv4_1 = false; + + ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); + kvm_vgic_global_state.ich_vtr_el2 = (u32)ich_vtr_el2; + + /* + * The ListRegs field is 5 bits, but there is an architectural + * maximum of 16 list registers. Just ignore bit 4... + */ + kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; + + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); + if (ret) { + kvm_err("Cannot register GICv3-legacy KVM device.\n"); + return ret; + } + + static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); + kvm_info("GCIE legacy system register CPU interface\n"); + + return 0; +} diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c new file mode 100644 index 000000000000..430aa98888fd --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic.c @@ -0,0 +1,1222 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. 
+ */ + +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/list_sort.h> +#include <linux/nospec.h> + +#include <asm/kvm_hyp.h> + +#include "vgic.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +struct vgic_global kvm_vgic_global_state __ro_after_init = { + .gicv3_cpuif = STATIC_KEY_FALSE_INIT, +}; + +/* + * Locking order is always: + * kvm->lock (mutex) + * vcpu->mutex (mutex) + * kvm->arch.config_lock (mutex) + * its->cmd_lock (mutex) + * its->its_lock (mutex) + * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_cpu->ap_list_lock must be taken with IRQs disabled + * vgic_irq->irq_lock must be taken with IRQs disabled + * + * As the ap_list_lock might be taken from the timer interrupt handler, + * we have to disable IRQs before taking this lock and everything lower + * than it. + * + * The config_lock has additional ordering requirements: + * kvm->slots_lock + * kvm->srcu + * kvm->arch.config_lock + * + * If you need to take multiple locks, always take the upper lock first, + * then the lower ones, e.g. first take the its_lock, then the irq_lock. + * If you are already holding a lock and need to take a higher one, you + * have to drop the lower ranking lock first and re-acquire it after having + * taken the upper one. + * + * When taking more than one ap_list_lock at the same time, always take the + * lowest numbered VCPU's ap_list_lock first, so: + * vcpuX->vcpu_id < vcpuY->vcpu_id: + * raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); + * raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); + * + * Since the VGIC must support injecting virtual interrupts from ISRs, we have + * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer + * spinlocks for any lock that may be taken while injecting an interrupt. + */ + +/* + * Index the VM's xarray of mapped LPIs and return a reference to the IRQ + * structure. The caller is expected to call vgic_put_irq() later once it's + * finished with the IRQ. + */ +static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = NULL; + + rcu_read_lock(); + + irq = xa_load(&dist->lpi_xa, intid); + if (!vgic_try_get_irq_ref(irq)) + irq = NULL; + + rcu_read_unlock(); + + return irq; +} + +/* + * This looks up the virtual interrupt ID to get the corresponding + * struct vgic_irq. It also increases the refcount, so any caller is expected + * to call vgic_put_irq() once it's finished with this IRQ. 
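+ *
+ * (Strictly speaking, only LPIs are refcounted; for SGIs, PPIs and
+ * SPIs the put is a no-op, but callers treat all INTIDs uniformly.)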
+ */ +struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid) +{ + /* SPIs */ + if (intid >= VGIC_NR_PRIVATE_IRQS && + intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { + intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); + return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; + } + + /* LPIs */ + if (intid >= VGIC_MIN_LPI) + return vgic_get_lpi(kvm, intid); + + return NULL; +} + +struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid) +{ + if (WARN_ON(!vcpu)) + return NULL; + + /* SGIs and PPIs */ + if (intid < VGIC_NR_PRIVATE_IRQS) { + intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS); + return &vcpu->arch.vgic_cpu.private_irqs[intid]; + } + + return vgic_get_irq(vcpu->kvm, intid); +} + +static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq) +{ + lockdep_assert_held(&dist->lpi_xa.xa_lock); + __xa_erase(&dist->lpi_xa, irq->intid); + kfree_rcu(irq, rcu); +} + +static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return false; + + return refcount_dec_and_test(&irq->refcount); +} + +static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq) +{ + if (!__vgic_put_irq(kvm, irq)) + return false; + + irq->pending_release = true; + return true; +} + +void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long flags; + + /* + * Normally the lock is only taken when the refcount drops to 0. + * Acquire/release it early on lockdep kernels to make locking issues + * in rare release paths a bit more obvious. + */ + if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) { + guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock); + } + + if (!__vgic_put_irq(kvm, irq)) + return; + + xa_lock_irqsave(&dist->lpi_xa, flags); + vgic_release_lpi_locked(dist, irq); + xa_unlock_irqrestore(&dist->lpi_xa, flags); +} + +static void vgic_release_deleted_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long flags, intid; + struct vgic_irq *irq; + + xa_lock_irqsave(&dist->lpi_xa, flags); + + xa_for_each(&dist->lpi_xa, intid, irq) { + if (irq->pending_release) + vgic_release_lpi_locked(dist, irq); + } + + xa_unlock_irqrestore(&dist->lpi_xa, flags); +} + +void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq, *tmp; + bool deleted = false; + unsigned long flags; + + raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); + + list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { + if (irq->intid >= VGIC_MIN_LPI) { + raw_spin_lock(&irq->irq_lock); + list_del(&irq->ap_list); + irq->vcpu = NULL; + raw_spin_unlock(&irq->irq_lock); + deleted |= vgic_put_irq_norelease(vcpu->kvm, irq); + } + } + + raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + + if (deleted) + vgic_release_deleted_lpis(vcpu->kvm); +} + +void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) +{ + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + pending)); +} + +bool vgic_get_phys_line_level(struct vgic_irq *irq) +{ + bool line_level; + + BUG_ON(!irq->hw); + + if (irq->ops && irq->ops->get_input_level) + return irq->ops->get_input_level(irq->intid); + + WARN_ON(irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &line_level)); + return line_level; +} + +/* Set/Clear the physical active state */ +void vgic_irq_set_phys_active(struct vgic_irq 
*irq, bool active) +{ + + BUG_ON(!irq->hw); + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_ACTIVE, + active)); +} + +/** + * vgic_target_oracle - compute the target vcpu for an irq + * + * @irq: The irq to route. Must be already locked. + * + * Based on the current state of the interrupt (enabled, pending, + * active, vcpu and target_vcpu), compute the next vcpu this should be + * given to. Return NULL if this shouldn't be injected at all. + * + * Requires the IRQ lock to be held. + */ +struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) +{ + lockdep_assert_held(&irq->irq_lock); + + /* If the interrupt is active, it must stay on the current vcpu */ + if (irq->active) + return irq->vcpu ? : irq->target_vcpu; + + /* + * If the IRQ is not active but enabled and pending, we should direct + * it to its configured target VCPU. + * If the distributor is disabled, pending interrupts shouldn't be + * forwarded. + */ + if (irq->enabled && irq_is_pending(irq)) { + if (unlikely(irq->target_vcpu && + !irq->target_vcpu->kvm->arch.vgic.enabled)) + return NULL; + + return irq->target_vcpu; + } + + /* If neither active nor pending and enabled, then this IRQ should not + * be queued to any VCPU. + */ + return NULL; +} + +struct vgic_sort_info { + struct kvm_vcpu *vcpu; + struct vgic_vmcr vmcr; +}; + +/* + * The order of items in the ap_lists defines how we'll pack things in LRs as + * well, the first items in the list being the first things populated in the + * LRs. + * + * Pending, non-active interrupts must be placed at the head of the list. + * Otherwise things should be sorted by the priority field and the GIC + * hardware support will take care of preemption of priority groups etc. + * Interrupts that are not deliverable should be at the end of the list. + * + * Return negative if "a" sorts before "b", 0 to preserve order, and positive + * to sort "b" before "a". + */ +static int vgic_irq_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); + struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); + struct vgic_sort_info *info = priv; + struct kvm_vcpu *vcpu = info->vcpu; + bool penda, pendb; + int ret; + + /* + * list_sort may call this function with the same element when + * the list is fairly long. + */ + if (unlikely(irqa == irqb)) + return 0; + + raw_spin_lock(&irqa->irq_lock); + raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); + + /* Undeliverable interrupts should be last */ + ret = (int)(vgic_target_oracle(irqb) == vcpu) - (int)(vgic_target_oracle(irqa) == vcpu); + if (ret) + goto out; + + /* Same thing for interrupts targeting a disabled group */ + ret = (int)(irqb->group ? info->vmcr.grpen1 : info->vmcr.grpen0); + ret -= (int)(irqa->group ? 
info->vmcr.grpen1 : info->vmcr.grpen0); + if (ret) + goto out; + + penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active; + pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active; + + ret = (int)pendb - (int)penda; + if (ret) + goto out; + + /* Both pending and enabled, sort by priority (lower number first) */ + ret = (int)irqa->priority - (int)irqb->priority; + if (ret) + goto out; + + /* Finally, HW bit active interrupts have priority over non-HW ones */ + ret = (int)irqb->hw - (int)irqa->hw; + +out: + raw_spin_unlock(&irqb->irq_lock); + raw_spin_unlock(&irqa->irq_lock); + return ret; +} + +/* Must be called with the ap_list_lock held */ +static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_sort_info info = { .vcpu = vcpu, }; + + lockdep_assert_held(&vgic_cpu->ap_list_lock); + + vgic_get_vmcr(vcpu, &info.vmcr); + list_sort(&info, &vgic_cpu->ap_list_head, vgic_irq_cmp); +} + +/* + * Only valid injection if changing level for level-triggered IRQs or for a + * rising edge, and in-kernel connected IRQ lines can only be controlled by + * their owner. + */ +static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner) +{ + if (irq->owner != owner) + return false; + + switch (irq->config) { + case VGIC_CONFIG_LEVEL: + return irq->line_level != level; + case VGIC_CONFIG_EDGE: + return level; + } + + return false; +} + +static bool vgic_model_needs_bcst_kick(struct kvm *kvm) +{ + /* + * A GICv3 (or GICv3-like) system exposing a GICv3 to the guest + * needs a broadcast kick to set TDIR globally. + * + * For systems that do not have TDIR (ARM's own v8.0 CPUs), the + * shadow TDIR bit is always set, and so is the register's TC bit, + * so no need to kick the CPUs. + */ + return (cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) && + kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3); +} + +/* + * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. + * Do the queuing if necessary, taking the right locks in the right order. + * Returns true when the IRQ was queued, false otherwise. + * + * Needs to be entered with the IRQ lock already held, but will return + * with all locks dropped. + */ +bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, + unsigned long flags) __releases(&irq->irq_lock) +{ + struct kvm_vcpu *vcpu; + bool bcast; + + lockdep_assert_held(&irq->irq_lock); + +retry: + vcpu = vgic_target_oracle(irq); + if (irq->vcpu || !vcpu) { + /* + * If this IRQ is already on a VCPU's ap_list, then it + * cannot be moved or modified and there is no more work for + * us to do. + * + * Otherwise, if the irq is not pending and enabled, it does + * not need to be inserted into an ap_list and there is also + * no more work for us to do. + */ + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + /* + * We have to kick the VCPU here, because we could be + * queueing an edge-triggered interrupt for which we + * get no EOI maintenance interrupt. In that case, + * while the IRQ is already on the VCPU's AP list, the + * VCPU could have EOI'ed the original interrupt and + * won't see this one until it exits for some other + * reason. + */ + if (vcpu) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } + return false; + } + + /* + * We must unlock the irq lock to take the ap_list_lock where + * we are going to insert this new pending interrupt. 
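+	 *
+	 * The lock order documented at the top of this file puts the
+	 * ap_list_lock above the irq_lock, so drop the irq_lock, take the
+	 * ap_list_lock, re-take the irq_lock and re-check the state.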
+ */ + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + /* someone can do stuff here, which we re-check below */ + + raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags); + raw_spin_lock(&irq->irq_lock); + + /* + * Did something change behind our backs? + * + * There are two cases: + * 1) The irq lost its pending state or was disabled behind our + * backs and/or it was queued to another VCPU's ap_list. + * 2) Someone changed the affinity on this irq behind our + * backs and we are now holding the wrong ap_list_lock. + * + * In both cases, drop the locks and retry. + */ + + if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) { + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, + flags); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + goto retry; + } + + /* + * Grab a reference to the irq to reflect the fact that it is + * now in the ap_list. This is safe as the caller must already hold a + * reference on the irq. + */ + vgic_get_irq_ref(irq); + list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); + irq->vcpu = vcpu; + + /* A new SPI may result in deactivation trapping on all vcpus */ + bcast = (vgic_model_needs_bcst_kick(vcpu->kvm) && + vgic_valid_spi(vcpu->kvm, irq->intid) && + atomic_fetch_inc(&vcpu->kvm->arch.vgic.active_spis) == 0); + + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); + + if (!bcast) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } else { + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_IRQ_PENDING); + } + + return true; +} + +/** + * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic + * @kvm: The VM structure pointer + * @vcpu: The CPU for PPIs or NULL for global interrupts + * @intid: The INTID to inject a new state to. + * @level: Edge-triggered: true: to trigger the interrupt + * false: to ignore the call + * Level-sensitive true: raise the input signal + * false: lower the input signal + * @owner: The opaque pointer to the owner of the IRQ being raised to verify + * that the caller is allowed to inject this IRQ. Userspace + * injections will have owner == NULL. + * + * The VGIC is not concerned with devices being active-LOW or active-HIGH for + * level-sensitive interrupts. You can think of the level parameter as 1 + * being HIGH and 0 being LOW and all devices being active-HIGH. + */ +int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int intid, bool level, void *owner) +{ + struct vgic_irq *irq; + unsigned long flags; + int ret; + + ret = vgic_lazy_init(kvm); + if (ret) + return ret; + + if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + + trace_vgic_update_irq_pending(vcpu ? vcpu->vcpu_idx : 0, intid, level); + + if (intid < VGIC_NR_PRIVATE_IRQS) + irq = vgic_get_vcpu_irq(vcpu, intid); + else + irq = vgic_get_irq(kvm, intid); + if (!irq) + return -EINVAL; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (!vgic_validate_injection(irq, level, owner)) { + /* Nothing to see here, move along... 
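+		 *
+		 * Either the line level did not change for a level-triggered
+		 * interrupt, a false level was fed to an edge-triggered one,
+		 * or the caller does not own this interrupt.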
*/ + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(kvm, irq); + return 0; + } + + if (irq->config == VGIC_CONFIG_LEVEL) + irq->line_level = level; + else + irq->pending_latch = true; + + vgic_queue_irq_unlock(kvm, irq, flags); + vgic_put_irq(kvm, irq); + + return 0; +} + +/* @irq->irq_lock must be held */ +static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + unsigned int host_irq, + struct irq_ops *ops) +{ + struct irq_desc *desc; + struct irq_data *data; + + /* + * Find the physical IRQ number corresponding to @host_irq + */ + desc = irq_to_desc(host_irq); + if (!desc) { + kvm_err("%s: no interrupt descriptor\n", __func__); + return -EINVAL; + } + data = irq_desc_get_irq_data(desc); + while (data->parent_data) + data = data->parent_data; + + irq->hw = true; + irq->host_irq = host_irq; + irq->hwintid = data->hwirq; + irq->ops = ops; + return 0; +} + +/* @irq->irq_lock must be held */ +static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq) +{ + irq->hw = false; + irq->hwintid = 0; + irq->ops = NULL; +} + +int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, + u32 vintid, struct irq_ops *ops) +{ + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); + unsigned long flags; + int ret; + + BUG_ON(!irq); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + ret = kvm_vgic_map_irq(vcpu, irq, host_irq, ops); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + return ret; +} + +/** + * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ + * @vcpu: The VCPU pointer + * @vintid: The INTID of the interrupt + * + * Reset the active and pending states of a mapped interrupt. Kernel + * subsystems injecting mapped interrupts should reset their interrupt lines + * when we are doing a reset of the VM. + */ +void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid) +{ + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); + unsigned long flags; + + if (!irq->hw) + goto out; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->active = false; + irq->pending_latch = false; + irq->line_level = false; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); +out: + vgic_put_irq(vcpu->kvm, irq); +} + +int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) +{ + struct vgic_irq *irq; + unsigned long flags; + + if (!vgic_initialized(vcpu->kvm)) + return -EAGAIN; + + irq = vgic_get_vcpu_irq(vcpu, vintid); + BUG_ON(!irq); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + kvm_vgic_unmap_irq(irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + return 0; +} + +int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid) +{ + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); + unsigned long flags; + int ret = -1; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw) + ret = irq->hwintid; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + return ret; +} + +/** + * kvm_vgic_set_owner - Set the owner of an interrupt for a VM + * + * @vcpu: Pointer to the VCPU (used for PPIs) + * @intid: The virtual INTID identifying the interrupt (PPI or SPI) + * @owner: Opaque pointer to the owner + * + * Returns 0 if intid is not already used by another in-kernel device and the + * owner is set, otherwise returns an error code. 
+ */
+int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
+{
+	struct vgic_irq *irq;
+	unsigned long flags;
+	int ret = 0;
+
+	if (!vgic_initialized(vcpu->kvm))
+		return -EAGAIN;
+
+	/* SGIs and LPIs cannot be wired up to any device */
+	if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
+		return -EINVAL;
+
+	irq = vgic_get_vcpu_irq(vcpu, intid);
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	if (irq->owner && irq->owner != owner)
+		ret = -EEXIST;
+	else
+		irq->owner = owner;
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+	return ret;
+}
+
+/**
+ * vgic_prune_ap_list - Remove non-relevant interrupts from the list
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Go over the list of "interesting" interrupts, and prune those that we
+ * won't have to consider in the near future.
+ */
+static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_irq *irq, *tmp;
+	bool deleted_lpis = false;
+
+	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+retry:
+	raw_spin_lock(&vgic_cpu->ap_list_lock);
+
+	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+		struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
+		bool target_vcpu_needs_kick = false;
+
+		raw_spin_lock(&irq->irq_lock);
+
+		BUG_ON(vcpu != irq->vcpu);
+
+		target_vcpu = vgic_target_oracle(irq);
+
+		if (!target_vcpu) {
+			/*
+			 * We don't need to process this interrupt any
+			 * further, move it off the list.
+			 */
+			list_del(&irq->ap_list);
+			irq->vcpu = NULL;
+			raw_spin_unlock(&irq->irq_lock);
+
+			/*
+			 * This vgic_put_irq call matches the
+			 * vgic_get_irq_ref in vgic_queue_irq_unlock,
+			 * where we added the LPI to the ap_list. As
+			 * we remove the irq from the list, we also
+			 * drop the refcount.
+			 */
+			deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq);
+			continue;
+		}
+
+		if (target_vcpu == vcpu) {
+			/* We're on the right CPU */
+			raw_spin_unlock(&irq->irq_lock);
+			continue;
+		}
+
+		/* This interrupt looks like it has to be migrated. */
+
+		raw_spin_unlock(&irq->irq_lock);
+		raw_spin_unlock(&vgic_cpu->ap_list_lock);
+
+		/*
+		 * Ensure locking order by always locking the smallest
+		 * ID first.
+		 */
+		if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
+			vcpuA = vcpu;
+			vcpuB = target_vcpu;
+		} else {
+			vcpuA = target_vcpu;
+			vcpuB = vcpu;
+		}
+
+		raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+		raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
+				     SINGLE_DEPTH_NESTING);
+		raw_spin_lock(&irq->irq_lock);
+
+		/*
+		 * If the affinity has been preserved, move the
+		 * interrupt around. Otherwise, it means things have
+		 * changed while the interrupt was unlocked, and we
+		 * need to replay this.
+		 *
+		 * In all cases, we cannot trust the list not to have
+		 * changed, so we restart from the beginning.
+ */ + if (target_vcpu == vgic_target_oracle(irq)) { + struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu; + + list_del(&irq->ap_list); + irq->vcpu = target_vcpu; + list_add_tail(&irq->ap_list, &new_cpu->ap_list_head); + target_vcpu_needs_kick = true; + } + + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); + raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); + + if (target_vcpu_needs_kick) { + kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu); + kvm_vcpu_kick(target_vcpu); + } + + goto retry; + } + + raw_spin_unlock(&vgic_cpu->ap_list_lock); + + if (unlikely(deleted_lpis)) + vgic_release_deleted_lpis(vcpu->kvm); +} + +static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_fold_lr_state(vcpu); + else + vgic_v3_fold_lr_state(vcpu); +} + +/* Requires the irq_lock to be held. */ +static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, + struct vgic_irq *irq, int lr) +{ + lockdep_assert_held(&irq->irq_lock); + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_populate_lr(vcpu, irq, lr); + else + vgic_v3_populate_lr(vcpu, irq, lr); +} + +static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_clear_lr(vcpu, lr); + else + vgic_v3_clear_lr(vcpu, lr); +} + +static void summarize_ap_list(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + + lockdep_assert_held(&vgic_cpu->ap_list_lock); + + *als = (typeof(*als)){}; + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + guard(raw_spinlock)(&irq->irq_lock); + + if (unlikely(vgic_target_oracle(irq) != vcpu)) + continue; + + if (!irq->active) + als->nr_pend++; + else + als->nr_act++; + + if (irq->intid < VGIC_NR_SGIS) + als->nr_sgi++; + } +} + +/* + * Dealing with LR overflow is close to black magic -- dress accordingly. + * + * We have to present an almost infinite number of interrupts through a very + * limited number of registers. Therefore crucial decisions must be made to + * ensure we feed the most relevant interrupts into the LRs, and yet have + * some facilities to let the guest interact with those that are not there. + * + * All considerations below are in the context of interrupts targeting a + * single vcpu with non-idle state (either pending, active, or both), + * colloquially called the ap_list: + * + * - Pending interrupts must have priority over active interrupts. This also + * excludes pending+active interrupts. This ensures that a guest can + * perform priority drops on any number of interrupts, and yet be + * presented the next pending one. + * + * - Deactivation of interrupts outside of the LRs must be tracked by using + * either the EOIcount-driven maintenance interrupt, and sometimes by + * trapping the DIR register. + * + * - For EOImode=0, a non-zero EOIcount means walking the ap_list past the + * point that made it into the LRs, and deactivate interrupts that would + * have made it onto the LRs if we had the space. + * + * - The MI-generation bits must be used to try and force an exit when the + * guest has done enough changes to the LRs that we want to reevaluate the + * situation: + * + * - if the total number of pending interrupts exceeds the number of + * LR, NPIE must be set in order to exit once no pending interrupts + * are present in the LRs, allowing us to populate the next batch. 
+ *
+ *   - if there are active interrupts outside of the LRs, then LRENPIE
+ *     must be set so that we exit on deactivation of one of these, and
+ *     work out which one is to be deactivated. Note that this is not
+ *     enough to deal with EOImode=1, see below.
+ *
+ *   - if the overall number of interrupts exceeds the number of LRs,
+ *     then UIE must be set to allow refilling of the LRs once the
+ *     majority of them have been processed.
+ *
+ *   - as usual, MI triggers are only an optimisation, since we cannot
+ *     rely on the MI being delivered in a timely manner...
+ *
+ * - EOImode=1 creates some additional problems:
+ *
+ *   - deactivation can happen in any order, and we cannot rely on
+ *     EOImode=0's coupling of priority-drop and deactivation which
+ *     imposes strict reverse Ack order. This means that DIR must
+ *     trap if we have active interrupts outside of the LRs.
+ *
+ *   - deactivation of SPIs can occur on any CPU, while the SPI is only
+ *     present in the ap_list of the CPU that actually ack-ed it. In that
+ *     case, EOIcount doesn't provide enough information, and we must
+ *     resort to trapping DIR even if we don't overflow the LRs. Bonus
+ *     point for not trapping DIR when no SPIs are pending or active in
+ *     the whole VM.
+ *
+ *   - LPIs do not suffer the same problem as SPIs on deactivation, as we
+ *     have to essentially discard the active state, see below.
+ *
+ * - Virtual LPIs have an active state (surprise!), which gets removed on
+ *   priority drop (EOI). However, EOIcount doesn't get bumped when the LPI
+ *   is not present in the LR (surprise again!). Special care must therefore
+ *   be taken to remove the active state from any activated LPI when exiting
+ *   from the guest. This is in a way no different from what happens on the
+ *   physical side. We still rely on the running priority to have been
+ *   removed from the APRs, irrespective of the LPI being present in the LRs
+ *   or not.
+ *
+ * - Virtual SGIs directly injected via GICv4.1 must not affect EOIcount, as
+ *   they are not managed in SW and don't have a true active state. So only
+ *   set vSGIEOICount when no SGIs are in the ap_list.
+ *
+ * - GICv2 SGIs with multiple sources are injected one source at a time, as
+ *   if they were made pending sequentially. This may mean that we don't
+ *   always present the HPPI if other interrupts with lower priority are
+ *   pending in the LRs. Big deal.
+ */ +static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct ap_list_summary als; + struct vgic_irq *irq; + int count = 0; + + lockdep_assert_held(&vgic_cpu->ap_list_lock); + + summarize_ap_list(vcpu, &als); + + if (irqs_outside_lrs(&als)) + vgic_sort_ap_list(vcpu); + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + scoped_guard(raw_spinlock, &irq->irq_lock) { + if (likely(vgic_target_oracle(irq) == vcpu)) { + vgic_populate_lr(vcpu, irq, count++); + } + } + + if (count == kvm_vgic_global_state.nr_lr) + break; + } + + /* Nuke remaining LRs */ + for (int i = count ; i < kvm_vgic_global_state.nr_lr; i++) + vgic_clear_lr(vcpu, i); + + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { + vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count; + vgic_v2_configure_hcr(vcpu, &als); + } else { + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count; + vgic_v3_configure_hcr(vcpu, &als); + } +} + +static inline bool can_access_vgic_from_kernel(void) +{ + /* + * GICv2 can always be accessed from the kernel because it is + * memory-mapped, and VHE systems can access GICv3 EL2 system + * registers. + */ + return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe(); +} + +static inline void vgic_save_state(struct kvm_vcpu *vcpu) +{ + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v2_save_state(vcpu); + else + __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); +} + +/* Sync back the hardware VGIC state into our emulation after a guest's run. */ +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) +{ + /* If nesting, emulate the HW effect from L0 to L1 */ + if (vgic_state_is_nested(vcpu)) { + vgic_v3_sync_nested(vcpu); + return; + } + + if (vcpu_has_nv(vcpu)) + vgic_v3_nested_update_mi(vcpu); + + if (can_access_vgic_from_kernel()) + vgic_save_state(vcpu); + + vgic_fold_lr_state(vcpu); + vgic_prune_ap_list(vcpu); +} + +/* Sync interrupts that were deactivated through a DIR trap */ +void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + vgic_prune_ap_list(vcpu); + local_irq_restore(flags); +} + +static inline void vgic_restore_state(struct kvm_vcpu *vcpu) +{ + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v2_restore_state(vcpu); + else + __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); +} + +/* Flush our emulation state into the GIC hardware before entering the guest. */ +void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) +{ + /* + * If in a nested state, we must return early. Two possibilities: + * + * - If we have any pending IRQ for the guest and the guest + * expects IRQs to be handled in its virtual EL2 mode (the + * virtual IMO bit is set) and it is not already running in + * virtual EL2 mode, then we have to emulate an IRQ + * exception to virtual EL2. + * + * We do that by placing a request to ourselves which will + * abort the entry procedure and inject the exception at the + * beginning of the run loop. + * + * - Otherwise, do exactly *NOTHING* apart from enabling the virtual + * CPU interface. The guest state is already loaded, and we can + * carry on with running it. + * + * If we have NV, but are not in a nested state, compute the + * maintenance interrupt state, as it may fire. 
+ */ + if (vgic_state_is_nested(vcpu)) { + if (kvm_vgic_vcpu_pending_irq(vcpu)) + kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu); + + vgic_v3_flush_nested(vcpu); + return; + } + + if (vcpu_has_nv(vcpu)) + vgic_v3_nested_update_mi(vcpu); + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock) + vgic_flush_lr_state(vcpu); + + if (can_access_vgic_from_kernel()) + vgic_restore_state(vcpu); + + if (vgic_supports_direct_irqs(vcpu->kvm)) + vgic_v4_commit(vcpu); +} + +void kvm_vgic_load(struct kvm_vcpu *vcpu) +{ + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); + return; + } + + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v2_load(vcpu); + else + vgic_v3_load(vcpu); +} + +void kvm_vgic_put(struct kvm_vcpu *vcpu) +{ + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); + return; + } + + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v2_put(vcpu); + else + vgic_v3_put(vcpu); +} + +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + bool pending = false; + unsigned long flags; + struct vgic_vmcr vmcr; + + if (!vcpu->kvm->arch.vgic.enabled) + return false; + + if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) + return true; + + vgic_get_vmcr(vcpu, &vmcr); + + raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + raw_spin_lock(&irq->irq_lock); + pending = irq_is_pending(irq) && irq->enabled && + !irq->active && + irq->priority < vmcr.pmr; + raw_spin_unlock(&irq->irq_lock); + + if (pending) + break; + } + + raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + + return pending; +} + +void vgic_kick_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long c; + + /* + * We've injected an interrupt, time to find out who deserves + * a good kick... + */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (kvm_vgic_vcpu_pending_irq(vcpu)) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } + } +} + +bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) +{ + struct vgic_irq *irq; + bool map_is_active; + unsigned long flags; + + if (!vgic_initialized(vcpu->kvm)) + return false; + + irq = vgic_get_vcpu_irq(vcpu, vintid); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + map_is_active = irq->hw && irq->active; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + return map_is_active; +} + +/* + * Level-triggered mapped IRQs are special because we only observe rising + * edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample the physical + * line and set the line level, because the device state could have changed + * or we simply need to process the still pending interrupt later. + * + * We could also have entered the guest with the interrupt active+pending. + * On the next exit, we need to re-evaluate the pending state, as it could + * otherwise result in a spurious interrupt by injecting a now potentially + * stale pending state. 
+ * + * If this causes us to lower the level, we have to also clear the physical + * active state, since we will otherwise never be told when the interrupt + * becomes asserted again. + * + * Another case is when the interrupt requires a helping hand on + * deactivation (no HW deactivation, for example). + */ +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending) +{ + if (vgic_irq_is_mapped_level(irq)) { + bool resample = false; + + if (unlikely(vgic_irq_needs_resampling(irq))) { + resample = !(irq->active || irq->pending_latch); + } else if (lr_pending || (lr_deactivated && irq->line_level)) { + irq->line_level = vgic_get_phys_line_level(irq); + resample = !irq->line_level; + } + + if (resample) + vgic_irq_set_phys_active(irq, false); + } +} diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h new file mode 100644 index 000000000000..5f0fc96b4dc2 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic.h @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + */ +#ifndef __KVM_ARM_VGIC_NEW_H__ +#define __KVM_ARM_VGIC_NEW_H__ + +#include <linux/irqchip/arm-gic-common.h> +#include <asm/kvm_mmu.h> + +#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ +#define IMPLEMENTER_ARM 0x43b + +#define VGIC_ADDR_UNDEF (-1) +#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) + +#define INTERRUPT_ID_BITS_SPIS 10 +#define INTERRUPT_ID_BITS_ITS 16 +#define VGIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1) +#define VGIC_PRI_BITS 5 + +#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) + +#define VGIC_AFFINITY_0_SHIFT 0 +#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT) +#define VGIC_AFFINITY_1_SHIFT 8 +#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT) +#define VGIC_AFFINITY_2_SHIFT 16 +#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT) +#define VGIC_AFFINITY_3_SHIFT 24 +#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT) + +#define VGIC_AFFINITY_LEVEL(reg, level) \ + ((((reg) & VGIC_AFFINITY_## level ##_MASK) \ + >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) + +/* + * The Userspace encodes the affinity differently from the MPIDR, + * Below macro converts vgic userspace format to MPIDR reg format. + */ +#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \ + VGIC_AFFINITY_LEVEL(val, 1) | \ + VGIC_AFFINITY_LEVEL(val, 2) | \ + VGIC_AFFINITY_LEVEL(val, 3)) + +/* + * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst, + * below macros are defined for CPUREG encoding. 
+ */ +#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 +#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT 14 +#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK 0x0000000000003800 +#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT 11 +#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK 0x0000000000000780 +#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT 7 +#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK 0x0000000000000078 +#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT 3 +#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK 0x0000000000000007 +#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT 0 + +#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) + +#define KVM_ICC_SRE_EL2 (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE | \ + ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB) +#define KVM_ICH_VTR_EL2_RES0 (ICH_VTR_EL2_DVIM | \ + ICH_VTR_EL2_A3V | \ + ICH_VTR_EL2_IDbits) +#define KVM_ICH_VTR_EL2_RES1 ICH_VTR_EL2_nV4 + +static inline u64 kvm_get_guest_vtr_el2(void) +{ + u64 vtr; + + vtr = kvm_vgic_global_state.ich_vtr_el2; + vtr &= ~KVM_ICH_VTR_EL2_RES0; + vtr |= KVM_ICH_VTR_EL2_RES1; + + return vtr; +} + +/* + * As per Documentation/virt/kvm/devices/arm-vgic-its.rst, + * below macros are defined for ITS table entry encoding. + */ +#define KVM_ITS_CTE_VALID_SHIFT 63 +#define KVM_ITS_CTE_VALID_MASK BIT_ULL(63) +#define KVM_ITS_CTE_RDBASE_SHIFT 16 +#define KVM_ITS_CTE_ICID_MASK GENMASK_ULL(15, 0) +#define KVM_ITS_ITE_NEXT_SHIFT 48 +#define KVM_ITS_ITE_PINTID_SHIFT 16 +#define KVM_ITS_ITE_PINTID_MASK GENMASK_ULL(47, 16) +#define KVM_ITS_ITE_ICID_MASK GENMASK_ULL(15, 0) +#define KVM_ITS_DTE_VALID_SHIFT 63 +#define KVM_ITS_DTE_VALID_MASK BIT_ULL(63) +#define KVM_ITS_DTE_NEXT_SHIFT 49 +#define KVM_ITS_DTE_NEXT_MASK GENMASK_ULL(62, 49) +#define KVM_ITS_DTE_ITTADDR_SHIFT 5 +#define KVM_ITS_DTE_ITTADDR_MASK GENMASK_ULL(48, 5) +#define KVM_ITS_DTE_SIZE_MASK GENMASK_ULL(4, 0) +#define KVM_ITS_L1E_VALID_MASK BIT_ULL(63) +/* we only support 64 kB translation table page size */ +#define KVM_ITS_L1E_ADDR_MASK GENMASK_ULL(51, 16) + +#define KVM_VGIC_V3_RDIST_INDEX_MASK GENMASK_ULL(11, 0) +#define KVM_VGIC_V3_RDIST_FLAGS_MASK GENMASK_ULL(15, 12) +#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT 12 +#define KVM_VGIC_V3_RDIST_BASE_MASK GENMASK_ULL(51, 16) +#define KVM_VGIC_V3_RDIST_COUNT_MASK GENMASK_ULL(63, 52) +#define KVM_VGIC_V3_RDIST_COUNT_SHIFT 52 + +#ifdef CONFIG_DEBUG_SPINLOCK +#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) +#else +#define DEBUG_SPINLOCK_BUG_ON(p) +#endif + +static inline u32 vgic_get_implementation_rev(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.vgic.implementation_rev; +} + +/* Requires the irq_lock to be held by the caller. 
*/ +static inline bool irq_is_pending(struct vgic_irq *irq) +{ + if (irq->config == VGIC_CONFIG_EDGE) + return irq->pending_latch; + else + return irq->pending_latch || irq->line_level; +} + +static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq) +{ + return irq->config == VGIC_CONFIG_LEVEL && irq->hw; +} + +static inline int vgic_irq_get_lr_count(struct vgic_irq *irq) +{ + /* Account for the active state as an interrupt */ + if (vgic_irq_is_sgi(irq->intid) && irq->source) + return hweight8(irq->source) + irq->active; + + return irq_is_pending(irq) || irq->active; +} + +static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) +{ + return vgic_irq_get_lr_count(irq) > 1; +} + +static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, + const void *data, unsigned long len) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret; + + dist->table_write_in_progress = true; + ret = kvm_write_guest_lock(kvm, gpa, data, len); + dist->table_write_in_progress = false; + + return ret; +} + +void kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst); + +static inline u64 vgic_ich_hcr_trap_bits(void) +{ + u64 hcr; + + /* All the traps are in the bottom 16bits */ + asm volatile(ALTERNATIVE_CB("movz %0, #0\n", + ARM64_ALWAYS_SYSTEM, + kvm_compute_ich_hcr_trap_bits) + : "=r" (hcr)); + + return hcr; +} + +/* + * This struct provides an intermediate representation of the fields contained + * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC + * state to userspace can generate either GICv2 or GICv3 CPU interface + * registers regardless of the hardware backed GIC used. + */ +struct vgic_vmcr { + u32 grpen0; + u32 grpen1; + + u32 ackctl; + u32 fiqen; + u32 cbpr; + u32 eoim; + + u32 abpr; + u32 bpr; + u32 pmr; /* Priority mask field in the GICC_PMR and + * ICC_PMR_EL1 priority field format */ +}; + +struct vgic_reg_attr { + struct kvm_vcpu *vcpu; + gpa_t addr; +}; + +struct its_device { + struct list_head dev_list; + + /* the head for the list of ITTEs */ + struct list_head itt_head; + u32 num_eventid_bits; + gpa_t itt_addr; + u32 device_id; +}; + +#define COLLECTION_NOT_MAPPED ((u32)~0) + +struct its_collection { + struct list_head coll_list; + + u32 collection_id; + u32 target_addr; +}; + +#define its_is_collection_mapped(coll) ((coll) && \ + ((coll)->target_addr != COLLECTION_NOT_MAPPED)) + +struct its_ite { + struct list_head ite_list; + + struct vgic_irq *irq; + struct its_collection *collection; + u32 event_id; +}; + +struct ap_list_summary { + unsigned int nr_pend; /* purely pending, not active */ + unsigned int nr_act; /* active, or active+pending */ + unsigned int nr_sgi; /* any SGI */ +}; + +#define irqs_outside_lrs(s) \ + (((s)->nr_pend + (s)->nr_act) > kvm_vgic_global_state.nr_lr) + +#define irqs_pending_outside_lrs(s) \ + ((s)->nr_pend > kvm_vgic_global_state.nr_lr) + +#define irqs_active_outside_lrs(s) \ + ((s)->nr_act && irqs_outside_lrs(s)) + +int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr); +int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr); +const struct vgic_register_region * +vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, + gpa_t addr, int len); +struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid); +struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid); +void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); +struct 
kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq); +bool vgic_get_phys_line_level(struct vgic_irq *irq); +void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); +void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); +bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, + unsigned long flags) __releases(&irq->irq_lock); +void vgic_kick_vcpus(struct kvm *kvm); +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending); + +int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, + phys_addr_t addr, phys_addr_t alignment, + phys_addr_t size); + +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val); +void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); +int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); +int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v2_reset(struct kvm_vcpu *vcpu); +int vgic_v2_probe(const struct gic_kvm_info *info); +int vgic_v2_map_resources(struct kvm *kvm); +int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, + enum vgic_type); + +void vgic_v2_init_lrs(void); +void vgic_v2_load(struct kvm_vcpu *vcpu); +void vgic_v2_put(struct kvm_vcpu *vcpu); + +void vgic_v2_save_state(struct kvm_vcpu *vcpu); +void vgic_v2_restore_state(struct kvm_vcpu *vcpu); + +static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq) +{ + if (!irq) + return false; + + if (irq->intid < VGIC_MIN_LPI) + return true; + + return refcount_inc_not_zero(&irq->refcount); +} + +static inline void vgic_get_irq_ref(struct vgic_irq *irq) +{ + WARN_ON_ONCE(!vgic_try_get_irq_ref(irq)); +} + +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val); +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); +void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v3_reset(struct kvm_vcpu *vcpu); +int vgic_v3_probe(const struct gic_kvm_info *info); +int vgic_v3_map_resources(struct kvm *kvm); +int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); +int vgic_v3_save_pending_tables(struct kvm *kvm); +int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count); +int vgic_register_redist_iodev(struct kvm_vcpu *vcpu); +void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu); +bool vgic_v3_check_base(struct kvm *kvm); + +void vgic_v3_load(struct kvm_vcpu *vcpu); +void vgic_v3_put(struct kvm_vcpu *vcpu); + +bool vgic_has_its(struct kvm *kvm); +int kvm_vgic_register_its_device(void); +void vgic_enable_lpis(struct kvm_vcpu *vcpu); +void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); +int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); +int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); +int vgic_v3_dist_uaccess(struct kvm_vcpu 
*vcpu, bool is_write, + int offset, u32 *val); +int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr, bool is_write); +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); +const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz); +int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, + u32 intid, u32 *val); +int kvm_register_vgic_device(unsigned long type); +void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +int vgic_lazy_init(struct kvm *kvm); +int vgic_init(struct kvm *kvm); + +void vgic_debug_init(struct kvm *kvm); +void vgic_debug_destroy(struct kvm *kvm); + +int vgic_v5_probe(const struct gic_kvm_info *info); + +static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu; + + /* + * num_pri_bits are initialized with HW supported values. + * We can rely safely on num_pri_bits even if VM has not + * restored ICC_CTLR_EL1 before restoring APnR registers. + */ + switch (cpu_if->num_pri_bits) { + case 7: return 3; + case 6: return 1; + default: return 0; + } +} + +static inline bool +vgic_v3_redist_region_full(struct vgic_redist_region *region) +{ + if (!region->count) + return false; + + return (region->free_index >= region->count); +} + +struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs); + +static inline size_t +vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) +{ + if (!rdreg->count) + return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE; + else + return rdreg->count * KVM_VGIC_V3_REDIST_SIZE; +} + +struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, + u32 index); +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg); + +bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); + +static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) +{ + struct vgic_dist *d = &kvm->arch.vgic; + + return (base + size > d->vgic_dist_base) && + (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE); +} + +bool vgic_lpis_enabled(struct kvm_vcpu *vcpu); +int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, struct vgic_irq **irq); +struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); +int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi); +void vgic_its_invalidate_all_caches(struct kvm *kvm); + +/* GICv4.1 MMIO interface */ +int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq); +int vgic_its_invall(struct kvm_vcpu *vcpu); + +bool system_supports_direct_sgis(void); +bool vgic_supports_direct_msis(struct kvm *kvm); +bool vgic_supports_direct_sgis(struct kvm *kvm); + +static inline bool vgic_supports_direct_irqs(struct kvm *kvm) +{ + return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm); +} + +int vgic_v4_init(struct kvm *kvm); +void vgic_v4_teardown(struct kvm *kvm); +void vgic_v4_configure_vsgis(struct kvm *kvm); +void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); +int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); + +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu); + +static inline bool kvm_has_gicv3(struct kvm *kvm) +{ + return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); +} + +void vgic_v3_flush_nested(struct 
kvm_vcpu *vcpu); +void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); +void vgic_v3_load_nested(struct kvm_vcpu *vcpu); +void vgic_v3_put_nested(struct kvm_vcpu *vcpu); +void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu); +void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu); + +static inline bool vgic_is_v3_compat(struct kvm *kvm) +{ + return cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF) && + kvm_vgic_global_state.has_gcie_v3_compat; +} + +static inline bool vgic_is_v3(struct kvm *kvm) +{ + return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm); +} + +int vgic_its_debug_init(struct kvm_device *dev); +void vgic_its_debug_destroy(struct kvm_device *dev); + +#endif diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c new file mode 100644 index 000000000000..7fe8ba1a2851 --- /dev/null +++ b/arch/arm64/kvm/vmid.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * VMID allocator. + * + * Based on Arm64 ASID allocator algorithm. + * Please refer arch/arm64/mm/context.c for detailed + * comments on algorithm. + * + * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved. + * Copyright (C) 2012 ARM Ltd. + */ + +#include <linux/bitfield.h> +#include <linux/bitops.h> + +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> + +unsigned int __ro_after_init kvm_arm_vmid_bits; +static DEFINE_RAW_SPINLOCK(cpu_vmid_lock); + +static atomic64_t vmid_generation; +static unsigned long *vmid_map; + +static DEFINE_PER_CPU(atomic64_t, active_vmids); +static DEFINE_PER_CPU(u64, reserved_vmids); + +#define VMID_MASK (~GENMASK(kvm_arm_vmid_bits - 1, 0)) +#define VMID_FIRST_VERSION (1UL << kvm_arm_vmid_bits) + +#define NUM_USER_VMIDS VMID_FIRST_VERSION +#define vmid2idx(vmid) ((vmid) & ~VMID_MASK) +#define idx2vmid(idx) vmid2idx(idx) + +/* + * As vmid #0 is always reserved, we will never allocate one + * as below and can be treated as invalid. This is used to + * set the active_vmids on vCPU schedule out. + */ +#define VMID_ACTIVE_INVALID VMID_FIRST_VERSION + +#define vmid_gen_match(vmid) \ + (!(((vmid) ^ atomic64_read(&vmid_generation)) >> kvm_arm_vmid_bits)) + +static void flush_context(void) +{ + int cpu; + u64 vmid; + + bitmap_zero(vmid_map, NUM_USER_VMIDS); + + for_each_possible_cpu(cpu) { + vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0); + + /* Preserve reserved VMID */ + if (vmid == 0) + vmid = per_cpu(reserved_vmids, cpu); + __set_bit(vmid2idx(vmid), vmid_map); + per_cpu(reserved_vmids, cpu) = vmid; + } + + /* + * Unlike ASID allocator, we expect less frequent rollover in + * case of VMIDs. Hence, instead of marking the CPU as + * flush_pending and issuing a local context invalidation on + * the next context-switch, we broadcast TLB flush + I-cache + * invalidation over the inner shareable domain on rollover. + */ + kvm_call_hyp(__kvm_flush_vm_context); +} + +static bool check_update_reserved_vmid(u64 vmid, u64 newvmid) +{ + int cpu; + bool hit = false; + + /* + * Iterate over the set of reserved VMIDs looking for a match + * and update to use newvmid (i.e. the same VMID in the current + * generation). 
+ */ + for_each_possible_cpu(cpu) { + if (per_cpu(reserved_vmids, cpu) == vmid) { + hit = true; + per_cpu(reserved_vmids, cpu) = newvmid; + } + } + + return hit; +} + +static u64 new_vmid(struct kvm_vmid *kvm_vmid) +{ + static u32 cur_idx = 1; + u64 vmid = atomic64_read(&kvm_vmid->id); + u64 generation = atomic64_read(&vmid_generation); + + if (vmid != 0) { + u64 newvmid = generation | (vmid & ~VMID_MASK); + + if (check_update_reserved_vmid(vmid, newvmid)) { + atomic64_set(&kvm_vmid->id, newvmid); + return newvmid; + } + + if (!__test_and_set_bit(vmid2idx(vmid), vmid_map)) { + atomic64_set(&kvm_vmid->id, newvmid); + return newvmid; + } + } + + vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, cur_idx); + if (vmid != NUM_USER_VMIDS) + goto set_vmid; + + /* We're out of VMIDs, so increment the global generation count */ + generation = atomic64_add_return_relaxed(VMID_FIRST_VERSION, + &vmid_generation); + flush_context(); + + /* We have more VMIDs than CPUs, so this will always succeed */ + vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, 1); + +set_vmid: + __set_bit(vmid, vmid_map); + cur_idx = vmid; + vmid = idx2vmid(vmid) | generation; + atomic64_set(&kvm_vmid->id, vmid); + return vmid; +} + +/* Called from vCPU sched out with preemption disabled */ +void kvm_arm_vmid_clear_active(void) +{ + atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID); +} + +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) +{ + unsigned long flags; + u64 vmid, old_active_vmid; + + vmid = atomic64_read(&kvm_vmid->id); + + /* + * Please refer comments in check_and_switch_context() in + * arch/arm64/mm/context.c. + * + * Unlike ASID allocator, we set the active_vmids to + * VMID_ACTIVE_INVALID on vCPU schedule out to avoid + * reserving the VMID space needlessly on rollover. + * Hence explicitly check here for a "!= 0" to + * handle the sync with a concurrent rollover. + */ + old_active_vmid = atomic64_read(this_cpu_ptr(&active_vmids)); + if (old_active_vmid != 0 && vmid_gen_match(vmid) && + 0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids), + old_active_vmid, vmid)) + return; + + raw_spin_lock_irqsave(&cpu_vmid_lock, flags); + + /* Check that our VMID belongs to the current generation. */ + vmid = atomic64_read(&kvm_vmid->id); + if (!vmid_gen_match(vmid)) + vmid = new_vmid(kvm_vmid); + + atomic64_set(this_cpu_ptr(&active_vmids), vmid); + raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags); +} + +/* + * Initialize the VMID allocator + */ +int __init kvm_arm_vmid_alloc_init(void) +{ + kvm_arm_vmid_bits = kvm_get_vmid_bits(); + + /* + * Expect allocation after rollover to fail if we don't have + * at least one more VMID than CPUs. VMID #0 is always reserved. + */ + WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus()); + atomic64_set(&vmid_generation, VMID_FIRST_VERSION); + vmid_map = bitmap_zalloc(NUM_USER_VMIDS, GFP_KERNEL); + if (!vmid_map) + return -ENOMEM; + + return 0; +} + +void __init kvm_arm_vmid_alloc_free(void) +{ + bitmap_free(vmid_map); +} |
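
The edge/level handling in kvm_vgic_inject_irq() above boils down to a small state
machine: a level-sensitive interrupt tracks the input signal in line_level, while an
edge-triggered one only latches rising edges into pending_latch and ignores a lowered
line. The standalone C sketch below is an editorial illustration, not code from the
patch; model_irq, model_inject() and model_is_pending() are invented names, and
locking, owner checks and the ap_list are deliberately left out.

#include <stdbool.h>
#include <stdio.h>

enum irq_config { CONFIG_EDGE, CONFIG_LEVEL };

struct model_irq {
	enum irq_config config;
	bool line_level;	/* sampled input, level-sensitive only */
	bool pending_latch;	/* latched pending state */
};

static bool model_is_pending(const struct model_irq *irq)
{
	if (irq->config == CONFIG_EDGE)
		return irq->pending_latch;
	return irq->pending_latch || irq->line_level;
}

static void model_inject(struct model_irq *irq, bool level)
{
	if (irq->config == CONFIG_LEVEL)
		irq->line_level = level;	/* follow the input signal */
	else if (level)
		irq->pending_latch = true;	/* only a rising edge latches */
	/* level == false on an edge interrupt is ignored, as documented above */
}

int main(void)
{
	struct model_irq edge = { .config = CONFIG_EDGE };
	struct model_irq lvl  = { .config = CONFIG_LEVEL };

	model_inject(&edge, true);
	model_inject(&edge, false);	/* no effect: the edge stays latched */
	printf("edge pending after raise+lower:  %d\n", model_is_pending(&edge));

	model_inject(&lvl, true);
	model_inject(&lvl, false);	/* lowering the line clears it */
	printf("level pending after raise+lower: %d\n", model_is_pending(&lvl));
	return 0;
}

Running it prints 1 for the edge case and 0 for the level case, which is the
asymmetry the @level kernel-doc above describes.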
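
When vgic_prune_ap_list() above decides an interrupt must migrate to another vCPU, it
drops its own ap_list_lock and then takes both ap_list_locks in vcpu_id order, so two
vCPUs migrating interrupts towards each other cannot deadlock. The pthread sketch
below only illustrates that ordering rule; fake_vcpu, lock_pair() and migrate() are
invented names, and the re-check of vgic_target_oracle() that the real code performs
after reacquiring the locks is not modelled.

#include <pthread.h>
#include <stdio.h>

struct fake_vcpu {
	int vcpu_id;
	pthread_mutex_t ap_list_lock;
};

static struct fake_vcpu vcpu0 = { 0, PTHREAD_MUTEX_INITIALIZER };
static struct fake_vcpu vcpu1 = { 1, PTHREAD_MUTEX_INITIALIZER };

static void lock_pair(struct fake_vcpu *a, struct fake_vcpu *b)
{
	/* Always take the lock of the lower vcpu_id first. */
	struct fake_vcpu *first  = (a->vcpu_id < b->vcpu_id) ? a : b;
	struct fake_vcpu *second = (a->vcpu_id < b->vcpu_id) ? b : a;

	pthread_mutex_lock(&first->ap_list_lock);
	pthread_mutex_lock(&second->ap_list_lock);
}

static void unlock_pair(struct fake_vcpu *a, struct fake_vcpu *b)
{
	pthread_mutex_unlock(&a->ap_list_lock);
	pthread_mutex_unlock(&b->ap_list_lock);
}

static void *migrate(void *arg)
{
	/* Each thread "migrates" in the opposite direction; the ordering saves us. */
	struct fake_vcpu **pair = arg;

	for (int i = 0; i < 100000; i++) {
		lock_pair(pair[0], pair[1]);
		unlock_pair(pair[0], pair[1]);
	}
	return NULL;
}

int main(void)
{
	struct fake_vcpu *fwd[2] = { &vcpu0, &vcpu1 };
	struct fake_vcpu *rev[2] = { &vcpu1, &vcpu0 };
	pthread_t t1, t2;

	pthread_create(&t1, NULL, migrate, fwd);
	pthread_create(&t2, NULL, migrate, rev);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("no deadlock: both migration directions completed\n");
	return 0;
}

Built with cc -pthread, both threads finish even though they nominally lock the pair
in opposite directions, which is exactly what the smallest-ID rule buys the kernel.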
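
The long comment above vgic_flush_lr_state() explains how an arbitrarily long ap_list
is squeezed into a handful of list registers: purely pending interrupts are preferred
over active ones, and whatever does not fit is handled later through maintenance
interrupts (NPIE, LRENPIE, UIE). The toy program below sketches only the
summarize-and-fill step under simplified assumptions; NR_LR, toy_irq and the plain
qsort() comparator are invented, and the real sort key and MI configuration are
richer than this.

#include <stdio.h>
#include <stdlib.h>

#define NR_LR 4		/* pretend the CPU only has four list registers */

struct toy_irq {
	unsigned int intid;
	unsigned int priority;	/* lower value = more urgent */
	int active;		/* 0 = purely pending, 1 = active (or active+pending) */
};

static int cmp_irq(const void *a, const void *b)
{
	const struct toy_irq *x = a, *y = b;

	/* Pending interrupts go ahead of active ones... */
	if (x->active != y->active)
		return x->active - y->active;
	/* ...and within each class, the more urgent (lower value) first. */
	return (int)x->priority - (int)y->priority;
}

int main(void)
{
	struct toy_irq ap_list[] = {
		{ 32, 0xa0, 1 }, { 33, 0x20, 0 }, { 34, 0x80, 0 },
		{ 35, 0x10, 1 }, { 36, 0x40, 0 }, { 37, 0x30, 0 },
	};
	int nr = sizeof(ap_list) / sizeof(ap_list[0]);

	qsort(ap_list, nr, sizeof(ap_list[0]), cmp_irq);

	for (int i = 0; i < nr && i < NR_LR; i++)
		printf("LR%d <- INTID %u (prio 0x%02x, %s)\n", i,
		       ap_list[i].intid, ap_list[i].priority,
		       ap_list[i].active ? "active" : "pending");

	if (nr > NR_LR)
		printf("%d interrupt(s) left outside the LRs: a maintenance "
		       "interrupt would be needed to refill them later\n",
		       nr - NR_LR);
	return 0;
}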
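
The VMID allocator in vmid.c above reuses the ASID allocator's generation-plus-index
scheme: an ID is only valid while its generation bits match the global generation,
and running out of indices bumps the generation, invalidates everything (with a TLB
flush in the real code) and starts again. The single-threaded userspace model below
is an editorial sketch with invented names (ID_BITS, alloc_id(), flush_all()); the
per-CPU active/reserved tracking, the cpu_vmid_lock and the hypervisor call are all
omitted.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ID_BITS		4			/* tiny ID space, to force a rollover */
#define FIRST_VERSION	(1ULL << ID_BITS)	/* the generation lives above the ID bits */
#define NR_IDS		FIRST_VERSION
#define ID_MASK		(FIRST_VERSION - 1)

static uint64_t generation = FIRST_VERSION;	/* current generation, never zero */
static uint8_t  id_map[NR_IDS];			/* 1 = index in use in this generation */

static int gen_match(uint64_t id)
{
	/* An ID is usable only while its generation bits match the current one. */
	return !((id ^ generation) >> ID_BITS);
}

static void flush_all(void)
{
	/* Rollover: forget every allocation (the kernel also flushes TLBs here). */
	memset(id_map, 0, sizeof(id_map));
}

static uint64_t alloc_id(uint64_t old)
{
	/* Prefer the previously used index if it is still free in this generation. */
	if (old) {
		uint64_t idx = old & ID_MASK;

		if (!id_map[idx]) {
			id_map[idx] = 1;
			return generation | idx;
		}
	}

	/* Otherwise pick any free index; index 0 stays reserved as "invalid". */
	for (uint64_t idx = 1; idx < NR_IDS; idx++) {
		if (!id_map[idx]) {
			id_map[idx] = 1;
			return generation | idx;
		}
	}

	/* Out of indices: bump the generation, wipe the map and start over. */
	generation += FIRST_VERSION;
	flush_all();
	id_map[1] = 1;
	return generation | 1;
}

int main(void)
{
	uint64_t guest[24] = { 0 };	/* more guests than usable IDs */

	for (int i = 0; i < 24; i++) {
		if (!gen_match(guest[i]))
			guest[i] = alloc_id(guest[i]);
		printf("guest %2d -> generation %llu, index %llu\n", i,
		       (unsigned long long)(guest[i] >> ID_BITS),
		       (unsigned long long)(guest[i] & ID_MASK));
	}
	return 0;
}

With only four ID bits the 24 toy guests overflow the space, and the output shows the
generation counter advancing at the rollover point, mirroring what vmid_gen_match()
and flush_context() arrange above.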
