diff options
Diffstat (limited to 'arch/arm64/kvm')
94 files changed, 26127 insertions, 6387 deletions
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 05da3c8f7e88..4f803fd1c99a 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -3,7 +3,6 @@ # KVM configuration # -source "virt/lib/Kconfig" source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION @@ -20,29 +19,25 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM - select MMU_NOTIFIER - select PREEMPT_NOTIFIERS + select KVM_COMMON + select KVM_GENERIC_HARDWARE_ENABLING + select KVM_GENERIC_MMU_NOTIFIER select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select KVM_XFER_TO_GUEST_WORK - select SRCU + select VIRT_XFER_TO_GUEST_WORK select KVM_VFIO - select HAVE_KVM_EVENTFD - select HAVE_KVM_IRQFD select HAVE_KVM_DIRTY_RING_ACQ_REL select NEED_KVM_DIRTY_RING_WITH_BITMAP select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING - select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS + select HAVE_KVM_READONLY_MEM select HAVE_KVM_VCPU_RUN_PID_CHANGE select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS - select INTERVAL_TREE + select KVM_GUEST_MEMFD help Support hosting virtualized guest machines. @@ -71,4 +66,21 @@ config PROTECTED_NVHE_STACKTRACE If unsure, or not using protected nVHE (pKVM), say N. +config PTDUMP_STAGE2_DEBUGFS + bool "Present the stage-2 pagetables to debugfs" + depends on KVM + depends on DEBUG_KERNEL + depends on DEBUG_FS + depends on ARCH_HAS_PTDUMP + select PTDUMP + default n + help + Say Y here if you want to show the stage-2 kernel pagetables + layout in a debugfs file. This information is only useful for kernel developers + who are working in architecture specific areas of the kernel. + It is probably not a good idea to enable this feature in a production + kernel. + + If in doubt, say N. + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 5e33c2d4645a..3ebc0570345c 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -3,26 +3,32 @@ # Makefile for Kernel-based Virtual Machine module # -ccflags-y += -I $(srctree)/$(src) +ccflags-y += -I $(src) include $(srctree)/virt/kvm/Makefile.kvm obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM) += hyp/ +CFLAGS_sys_regs.o += -Wno-override-init +CFLAGS_handle_exit.o += -Wno-override-init + kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ - inject_fault.o va_layout.o handle_exit.o \ + inject_fault.o va_layout.o handle_exit.o config.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ - arch_timer.o trng.o vmid.o \ + arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ - vgic/vgic-its.o vgic/vgic-debug.o + vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o \ + vgic/vgic-v5.o kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o +kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o +kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o always-y := hyp_constants.h hyp-constants.s @@ -30,7 +36,7 @@ define rule_gen_hyp_constants $(call filechk,offsets,__HYP_CONSTANTS_H__) endef -CFLAGS_hyp-constants.o = -I $(srctree)/$(src)/hyp/include +CFLAGS_hyp-constants.o = -I $(src)/hyp/include $(obj)/hyp-constants.s: $(src)/hyp/hyp-constants.c FORCE $(call if_changed_dep,cc_s_c) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index bb24a76b4224..99a07972068d 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -16,6 +16,7 @@ #include <asm/arch_timer.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_nested.h> #include <kvm/arm_vgic.h> #include <kvm/arm_arch_timer.h> @@ -29,15 +30,13 @@ static u32 host_vtimer_irq_flags; static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); +DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key); -static const struct kvm_irq_level default_ptimer_irq = { - .irq = 30, - .level = 1, -}; - -static const struct kvm_irq_level default_vtimer_irq = { - .irq = 27, - .level = 1, +static const u8 default_ppi[] = { + [TIMER_PTIMER] = 30, + [TIMER_VTIMER] = 27, + [TIMER_HPTIMER] = 26, + [TIMER_HVTIMER] = 28, }; static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); @@ -51,16 +50,33 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, struct arch_timer_context *timer, enum kvm_arch_timer_regs treg); +static bool kvm_arch_timer_get_input_level(int vintid); + +static struct irq_ops arch_timer_irq_ops = { + .get_input_level = kvm_arch_timer_get_input_level, +}; + +static int nr_timers(struct kvm_vcpu *vcpu) +{ + if (!vcpu_has_nv(vcpu)) + return NR_KVM_EL0_TIMERS; + + return NR_KVM_TIMERS; +} u32 timer_get_ctl(struct arch_timer_context *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: return __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); case TIMER_PTIMER: return __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + case TIMER_HVTIMER: + return __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2); + case TIMER_HPTIMER: + return __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2); default: WARN_ON(1); return 0; @@ -69,41 +85,39 @@ u32 timer_get_ctl(struct arch_timer_context *ctxt) u64 timer_get_cval(struct arch_timer_context *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: return __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); case TIMER_PTIMER: return __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + case TIMER_HVTIMER: + return __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2); + case TIMER_HPTIMER: + return __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); default: WARN_ON(1); return 0; } } -static u64 timer_get_offset(struct arch_timer_context *ctxt) -{ - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch(arch_timer_ctx_index(ctxt)) { - case TIMER_VTIMER: - return __vcpu_sys_reg(vcpu, CNTVOFF_EL2); - default: - return 0; - } -} - static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: - __vcpu_sys_reg(vcpu, CNTV_CTL_EL0) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTV_CTL_EL0, ctl); break; case TIMER_PTIMER: - __vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTP_CTL_EL0, ctl); + break; + case TIMER_HVTIMER: + __vcpu_assign_sys_reg(vcpu, CNTHV_CTL_EL2, ctl); + break; + case TIMER_HPTIMER: + __vcpu_assign_sys_reg(vcpu, CNTHP_CTL_EL2, ctl); break; default: WARN_ON(1); @@ -112,30 +126,23 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: - __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0) = cval; + __vcpu_assign_sys_reg(vcpu, CNTV_CVAL_EL0, cval); break; case TIMER_PTIMER: - __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = cval; + __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, cval); break; - default: - WARN_ON(1); - } -} - -static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset) -{ - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch(arch_timer_ctx_index(ctxt)) { - case TIMER_VTIMER: - __vcpu_sys_reg(vcpu, CNTVOFF_EL2) = offset; + case TIMER_HVTIMER: + __vcpu_assign_sys_reg(vcpu, CNTHV_CVAL_EL2, cval); + break; + case TIMER_HPTIMER: + __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, cval); break; default: - WARN(offset, "timer %ld\n", arch_timer_ctx_index(ctxt)); + WARN_ON(1); } } @@ -144,15 +151,29 @@ u64 kvm_phys_timer_read(void) return timecounter->cc->read(timecounter->cc); } -static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) +void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) { - if (has_vhe()) { + if (vcpu_has_nv(vcpu)) { + if (is_hyp_ctxt(vcpu)) { + map->direct_vtimer = vcpu_hvtimer(vcpu); + map->direct_ptimer = vcpu_hptimer(vcpu); + map->emul_vtimer = vcpu_vtimer(vcpu); + map->emul_ptimer = vcpu_ptimer(vcpu); + } else { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_vtimer = vcpu_hvtimer(vcpu); + map->emul_ptimer = vcpu_hptimer(vcpu); + } + } else if (has_vhe()) { map->direct_vtimer = vcpu_vtimer(vcpu); map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_vtimer = NULL; map->emul_ptimer = NULL; } else { map->direct_vtimer = vcpu_vtimer(vcpu); map->direct_ptimer = NULL; + map->emul_vtimer = NULL; map->emul_ptimer = vcpu_ptimer(vcpu); } @@ -161,8 +182,7 @@ static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) static inline bool userspace_irqchip(struct kvm *kvm) { - return static_branch_unlikely(&userspace_irqchip_in_use) && - unlikely(!irqchip_in_kernel(kvm)); + return unlikely(!irqchip_in_kernel(kvm)); } static void soft_timer_start(struct hrtimer *hrt, u64 ns) @@ -219,7 +239,7 @@ static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx, ns = cyclecounter_cyc2ns(timecounter->cc, val - now, timecounter->mask, - &timecounter->frac); + &timer_ctx->ns_frac); return ns; } @@ -247,8 +267,10 @@ static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu) static u64 wfit_delay_ns(struct kvm_vcpu *vcpu) { - struct arch_timer_context *ctx = vcpu_vtimer(vcpu); u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + struct arch_timer_context *ctx; + + ctx = is_hyp_ctxt(vcpu) ? vcpu_hvtimer(vcpu) : vcpu_vtimer(vcpu); return kvm_counter_compute_delta(ctx, val); } @@ -262,7 +284,7 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) u64 min_delta = ULLONG_MAX; int i; - for (i = 0; i < NR_KVM_TIMERS; i++) { + for (i = 0; i < nr_timers(vcpu); i++) { struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; WARN(ctx->loaded, "timer %d loaded\n", i); @@ -311,7 +333,7 @@ static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) u64 ns; ctx = container_of(hrt, struct arch_timer_context, hrtimer); - vcpu = ctx->vcpu; + vcpu = timer_context_to_vcpu(ctx); trace_kvm_timer_hrtimer_expire(ctx); @@ -345,9 +367,11 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) switch (index) { case TIMER_VTIMER: + case TIMER_HVTIMER: cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); break; case TIMER_PTIMER: + case TIMER_HPTIMER: cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); break; case NR_KVM_TIMERS: @@ -393,22 +417,40 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; } +static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) +{ + /* + * Paper over NV2 brokenness by publishing the interrupt status + * bit. This still results in a poor quality of emulation (guest + * writes will have no effect until the next exit). + * + * But hey, it's fast, right? + */ + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx); + if (is_hyp_ctxt(vcpu) && + (ctx == vcpu_vtimer(vcpu) || ctx == vcpu_ptimer(vcpu))) { + unsigned long val = timer_get_ctl(ctx); + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level); + timer_set_ctl(ctx, val); + } +} + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { - int ret; + kvm_timer_update_status(timer_ctx, new_level); timer_ctx->irq.level = new_level; - trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, + trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); - if (!userspace_irqchip(vcpu->kvm)) { - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - timer_ctx->irq.irq, - timer_ctx->irq.level, - timer_ctx); - WARN_ON(ret); - } + if (userspace_irqchip(vcpu->kvm)) + return; + + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + timer_irq(timer_ctx), + timer_ctx->irq.level, + timer_ctx); } /* Only called for a fully emulated timer */ @@ -418,27 +460,36 @@ static void timer_emulate(struct arch_timer_context *ctx) trace_kvm_timer_emulate(ctx, should_fire); - if (should_fire != ctx->irq.level) { - kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); - return; - } + if (should_fire != ctx->irq.level) + kvm_timer_update_irq(timer_context_to_vcpu(ctx), should_fire, ctx); + + kvm_timer_update_status(ctx, should_fire); /* * If the timer can fire now, we don't need to have a soft timer * scheduled for the future. If the timer cannot fire at all, * then we also don't need a soft timer. */ - if (!kvm_timer_irq_can_fire(ctx)) { - soft_timer_cancel(&ctx->hrtimer); + if (should_fire || !kvm_timer_irq_can_fire(ctx)) return; - } soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); } +static void set_cntvoff(u64 cntvoff) +{ + kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff); +} + +static void set_cntpoff(u64 cntpoff) +{ + if (has_cntpoff()) + write_sysreg_s(cntpoff, SYS_CNTPOFF_EL2); +} + static void timer_save_state(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx)); enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; @@ -451,23 +502,53 @@ static void timer_save_state(struct arch_timer_context *ctx) goto out; switch (index) { + u64 cval; + case TIMER_VTIMER: + case TIMER_HVTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL)); - timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL)); + cval = read_sysreg_el0(SYS_CNTV_CVAL); + + if (has_broken_cntvoff()) + cval -= timer_get_offset(ctx); + + timer_set_cval(ctx, cval); /* Disable the timer */ write_sysreg_el0(0, SYS_CNTV_CTL); isb(); + /* + * The kernel may decide to run userspace after + * calling vcpu_put, so we reset cntvoff to 0 to + * ensure a consistent read between user accesses to + * the virtual counter and kernel access to the + * physical counter of non-VHE case. + * + * For VHE, the virtual counter uses a fixed virtual + * offset of zero, so no need to zero CNTVOFF_EL2 + * register, but this is actually useful when switching + * between EL1/vEL2 with NV. + * + * Do it unconditionally, as this is either unavoidable + * or dirt cheap. + */ + set_cntvoff(0); break; case TIMER_PTIMER: + case TIMER_HPTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL)); - timer_set_cval(ctx, read_sysreg_el0(SYS_CNTP_CVAL)); + cval = read_sysreg_el0(SYS_CNTP_CVAL); + + cval -= timer_get_offset(ctx); + + timer_set_cval(ctx, cval); /* Disable the timer */ write_sysreg_el0(0, SYS_CNTP_CTL); isb(); + set_cntpoff(0); break; case NR_KVM_TIMERS: BUG(); @@ -498,6 +579,7 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) */ if (!kvm_timer_irq_can_fire(map.direct_vtimer) && !kvm_timer_irq_can_fire(map.direct_ptimer) && + !kvm_timer_irq_can_fire(map.emul_vtimer) && !kvm_timer_irq_can_fire(map.emul_ptimer) && !vcpu_has_wfit_active(vcpu)) return; @@ -518,7 +600,7 @@ static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) static void timer_restore_state(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx)); enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; @@ -531,13 +613,29 @@ static void timer_restore_state(struct arch_timer_context *ctx) goto out; switch (index) { + u64 cval, offset; + case TIMER_VTIMER: - write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL); + case TIMER_HVTIMER: + cval = timer_get_cval(ctx); + offset = timer_get_offset(ctx); + if (has_broken_cntvoff()) { + set_cntvoff(0); + cval += offset; + } else { + set_cntvoff(offset); + } + write_sysreg_el0(cval, SYS_CNTV_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL); break; case TIMER_PTIMER: - write_sysreg_el0(timer_get_cval(ctx), SYS_CNTP_CVAL); + case TIMER_HPTIMER: + cval = timer_get_cval(ctx); + offset = timer_get_offset(ctx); + set_cntpoff(offset); + cval += offset; + write_sysreg_el0(cval, SYS_CNTP_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL); break; @@ -552,11 +650,6 @@ out: local_irq_restore(flags); } -static void set_cntvoff(u64 cntvoff) -{ - kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff); -} - static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) { int r; @@ -566,7 +659,7 @@ static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, boo static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) { - struct kvm_vcpu *vcpu = ctx->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx); bool phys_active = false; /* @@ -575,10 +668,10 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) * this point and the register restoration, we'll take the * interrupt anyway. */ - kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(ctx), ctx); if (irqchip_in_kernel(vcpu->kvm)) - phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); + phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx)); phys_active |= ctx->irq.level; @@ -613,6 +706,153 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } +/* If _pred is true, set bit in _set, otherwise set it in _clr */ +#define assign_clear_set_bit(_pred, _bit, _clr, _set) \ + do { \ + if (_pred) \ + (_set) |= (_bit); \ + else \ + (_clr) |= (_bit); \ + } while (0) + +static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, + struct timer_map *map) +{ + int hw, ret; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + /* + * We only ever unmap the vtimer irq on a VHE system that runs nested + * virtualization, in which case we have both a valid emul_vtimer, + * emul_ptimer, direct_vtimer, and direct_ptimer. + * + * Since this is called from kvm_timer_vcpu_load(), a change between + * vEL2 and vEL1/0 will have just happened, and the timer_map will + * represent this, and therefore we switch the emul/direct mappings + * below. + */ + hw = kvm_vgic_get_map(vcpu, timer_irq(map->direct_vtimer)); + if (hw < 0) { + kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_vtimer)); + kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_ptimer)); + + ret = kvm_vgic_map_phys_irq(vcpu, + map->direct_vtimer->host_timer_irq, + timer_irq(map->direct_vtimer), + &arch_timer_irq_ops); + WARN_ON_ONCE(ret); + ret = kvm_vgic_map_phys_irq(vcpu, + map->direct_ptimer->host_timer_irq, + timer_irq(map->direct_ptimer), + &arch_timer_irq_ops); + WARN_ON_ONCE(ret); + } +} + +static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) +{ + bool tvt, tpt, tvc, tpc, tvt02, tpt02; + u64 clr, set; + + /* + * No trapping gets configured here with nVHE. See + * __timer_enable_traps(), which is where the stuff happens. + */ + if (!has_vhe()) + return; + + /* + * Our default policy is not to trap anything. As we progress + * within this function, reality kicks in and we start adding + * traps based on emulation requirements. + */ + tvt = tpt = tvc = tpc = false; + tvt02 = tpt02 = false; + + /* + * NV2 badly breaks the timer semantics by redirecting accesses to + * the EL1 timer state to memory, so let's call ECV to the rescue if + * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses. + * + * The treatment slightly varies depending whether we run a nVHE or + * VHE guest: nVHE will use the _EL0 registers directly, while VHE + * will use the _EL02 accessors. This translates in different trap + * bits. + * + * None of the trapping is required when running in non-HYP context, + * unless required by the L1 hypervisor settings once we advertise + * ECV+NV in the guest, or that we need trapping for other reasons. + */ + if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) { + if (vcpu_el2_e2h_is_set(vcpu)) + tvt02 = tpt02 = true; + else + tvt = tpt = true; + } + + /* + * We have two possibility to deal with a physical offset: + * + * - Either we have CNTPOFF (yay!) or the offset is 0: + * we let the guest freely access the HW + * + * - or neither of these condition apply: + * we trap accesses to the HW, but still use it + * after correcting the physical offset + */ + if (!has_cntpoff() && timer_get_offset(map->direct_ptimer)) + tpt = tpc = true; + + /* + * For the poor sods that could not correctly subtract one value + * from another, trap the full virtual timer and counter. + */ + if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer)) + tvt = tvc = true; + + /* + * Apply the enable bits that the guest hypervisor has requested for + * its own guest. We can only add traps that wouldn't have been set + * above. + * Implementation choices: we do not support NV when E2H=0 in the + * guest, and we don't support configuration where E2H is writable + * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but + * not both). This simplifies the handling of the EL1NV* bits. + */ + if (is_nested_ctxt(vcpu)) { + u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + /* Use the VHE format for mental sanity */ + if (!vcpu_el2_e2h_is_set(vcpu)) + val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10; + + tpt |= !(val & (CNTHCTL_EL1PCEN << 10)); + tpc |= !(val & (CNTHCTL_EL1PCTEN << 10)); + + tpt02 |= (val & CNTHCTL_EL1NVPCT); + tvt02 |= (val & CNTHCTL_EL1NVVCT); + } + + /* + * Now that we have collected our requirements, compute the + * trap and enable bits. + */ + set = 0; + clr = 0; + + assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr); + assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr); + assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set); + assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set); + assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set); + assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set); + + /* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */ + sysreg_clear_set(cnthctl_el2, clr, set); +} + void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); @@ -624,6 +864,9 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) get_timer_map(vcpu, &map); if (static_branch_likely(&has_gic_active_state)) { + if (vcpu_has_nv(vcpu)) + kvm_timer_vcpu_load_nested_switch(vcpu, &map); + kvm_timer_vcpu_load_gic(map.direct_vtimer); if (map.direct_ptimer) kvm_timer_vcpu_load_gic(map.direct_ptimer); @@ -631,16 +874,17 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) kvm_timer_vcpu_load_nogic(vcpu); } - set_cntvoff(timer_get_offset(map.direct_vtimer)); - kvm_timer_unblocking(vcpu); timer_restore_state(map.direct_vtimer); if (map.direct_ptimer) timer_restore_state(map.direct_ptimer); - + if (map.emul_vtimer) + timer_emulate(map.emul_vtimer); if (map.emul_ptimer) timer_emulate(map.emul_ptimer); + + timer_set_traps(vcpu, &map); } bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) @@ -683,20 +927,51 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). */ + if (map.emul_vtimer) + soft_timer_cancel(&map.emul_vtimer->hrtimer); if (map.emul_ptimer) soft_timer_cancel(&map.emul_ptimer->hrtimer); if (kvm_vcpu_is_blocking(vcpu)) kvm_timer_blocking(vcpu); +} +void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) +{ /* - * The kernel may decide to run userspace after calling vcpu_put, so - * we reset cntvoff to 0 to ensure a consistent read between user - * accesses to the virtual counter and kernel access to the physical - * counter of non-VHE case. For VHE, the virtual counter uses a fixed - * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. + * When NV2 is on, guest hypervisors have their EL1 timer register + * accesses redirected to the VNCR page. Any guest action taken on + * the timer is postponed until the next exit, leading to a very + * poor quality of emulation. + * + * This is an unmitigated disaster, only papered over by FEAT_ECV, + * which allows trapping of the timer registers even with NV2. + * Still, this is still worse than FEAT_NV on its own. Meh. */ - set_cntvoff(0); + if (!cpus_have_final_cap(ARM64_HAS_ECV)) { + /* + * For a VHE guest hypervisor, the EL2 state is directly + * stored in the host EL1 timers, while the emulated EL1 + * state is stored in the VNCR page. The latter could have + * been updated behind our back, and we must reset the + * emulation of the timers. + * + * A non-VHE guest hypervisor doesn't have any direct access + * to its timers: the EL2 registers trap despite being + * notionally direct (we use the EL1 HW, as for VHE), while + * the EL1 registers access memory. + * + * In both cases, process the emulated timers on each guest + * exit. Boo. + */ + struct timer_map map; + get_timer_map(vcpu, &map); + + soft_timer_cancel(&map.emul_vtimer->hrtimer); + soft_timer_cancel(&map.emul_ptimer->hrtimer); + timer_emulate(map.emul_vtimer); + timer_emulate(map.emul_ptimer); + } } /* @@ -728,7 +1003,7 @@ void kvm_timer_sync_user(struct kvm_vcpu *vcpu) unmask_vtimer_irq_user(vcpu); } -int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) +void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; @@ -741,113 +1016,100 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) * resets the timer to be disabled and unmasked and is compliant with * the ARMv7 architecture. */ - timer_set_ctl(vcpu_vtimer(vcpu), 0); - timer_set_ctl(vcpu_ptimer(vcpu), 0); + for (int i = 0; i < nr_timers(vcpu); i++) + timer_set_ctl(vcpu_get_timer(vcpu, i), 0); + + /* + * A vcpu running at EL2 is in charge of the offset applied to + * the virtual timer, so use the physical VM offset, and point + * the vcpu offset to CNTVOFF_EL2. + */ + if (vcpu_has_nv(vcpu)) { + struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset; + + offs->vcpu_offset = __ctxt_sys_reg(&vcpu->arch.ctxt, CNTVOFF_EL2); + offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset; + } if (timer->enabled) { - kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu)); - kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu)); + for (int i = 0; i < nr_timers(vcpu); i++) + kvm_timer_update_irq(vcpu, false, + vcpu_get_timer(vcpu, i)); if (irqchip_in_kernel(vcpu->kvm)) { - kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq); + kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_vtimer)); if (map.direct_ptimer) - kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq); + kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_ptimer)); } } + if (map.emul_vtimer) + soft_timer_cancel(&map.emul_vtimer->hrtimer); if (map.emul_ptimer) soft_timer_cancel(&map.emul_ptimer->hrtimer); - - return 0; } -/* Make the updates of cntvoff for all vtimer contexts atomic */ -static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) +static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) { - unsigned long i; + struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid); struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *tmp; - mutex_lock(&kvm->lock); - kvm_for_each_vcpu(i, tmp, kvm) - timer_set_offset(vcpu_vtimer(tmp), cntvoff); + ctxt->timer_id = timerid; - /* - * When called from the vcpu create path, the CPU being created is not - * included in the loop above, so we just set it here as well. - */ - timer_set_offset(vcpu_vtimer(vcpu), cntvoff); - mutex_unlock(&kvm->lock); + if (timerid == TIMER_VTIMER) + ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; + else + ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; + + hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + + switch (timerid) { + case TIMER_PTIMER: + case TIMER_HPTIMER: + ctxt->host_timer_irq = host_ptimer_irq; + break; + case TIMER_VTIMER: + case TIMER_HVTIMER: + ctxt->host_timer_irq = host_vtimer_irq; + break; + } } void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - vtimer->vcpu = vcpu; - ptimer->vcpu = vcpu; - /* Synchronize cntvoff across all vtimers of a VM. */ - update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); - timer_set_offset(ptimer, 0); + for (int i = 0; i < NR_KVM_TIMERS; i++) + timer_context_init(vcpu, i); - hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - timer->bg_timer.function = kvm_bg_timer_expire; - - hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - vtimer->hrtimer.function = kvm_hrtimer_expire; - ptimer->hrtimer.function = kvm_hrtimer_expire; - - vtimer->irq.irq = default_vtimer_irq.irq; - ptimer->irq.irq = default_ptimer_irq.irq; + /* Synchronize offsets across timers of a VM if not already provided */ + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { + timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read()); + timer_set_offset(vcpu_ptimer(vcpu), 0); + } - vtimer->host_timer_irq = host_vtimer_irq; - ptimer->host_timer_irq = host_ptimer_irq; + hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); +} - vtimer->host_timer_irq_flags = host_vtimer_irq_flags; - ptimer->host_timer_irq_flags = host_ptimer_irq_flags; +void kvm_timer_init_vm(struct kvm *kvm) +{ + for (int i = 0; i < NR_KVM_TIMERS; i++) + kvm->arch.timer_data.ppi[i] = default_ppi[i]; } -static void kvm_timer_init_interrupt(void *info) +void kvm_timer_cpu_up(void) { enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); - enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); + if (host_ptimer_irq) + enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); } -int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) +void kvm_timer_cpu_down(void) { - struct arch_timer_context *timer; - - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - timer = vcpu_vtimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); - break; - case KVM_REG_ARM_TIMER_CNT: - timer = vcpu_vtimer(vcpu); - update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); - break; - case KVM_REG_ARM_TIMER_CVAL: - timer = vcpu_vtimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); - break; - case KVM_REG_ARM_PTIMER_CTL: - timer = vcpu_ptimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); - break; - case KVM_REG_ARM_PTIMER_CVAL: - timer = vcpu_ptimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); - break; - - default: - return -1; - } - - return 0; + disable_percpu_irq(host_vtimer_irq); + if (host_ptimer_irq) + disable_percpu_irq(host_ptimer_irq); } static u64 read_timer_ctl(struct arch_timer_context *timer) @@ -866,31 +1128,6 @@ static u64 read_timer_ctl(struct arch_timer_context *timer) return ctl; } -u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) -{ - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - return kvm_arm_timer_read(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CTL); - case KVM_REG_ARM_TIMER_CNT: - return kvm_arm_timer_read(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CNT); - case KVM_REG_ARM_TIMER_CVAL: - return kvm_arm_timer_read(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CVAL); - case KVM_REG_ARM_PTIMER_CTL: - return kvm_arm_timer_read(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CTL); - case KVM_REG_ARM_PTIMER_CNT: - return kvm_arm_timer_read(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CNT); - case KVM_REG_ARM_PTIMER_CVAL: - return kvm_arm_timer_read(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CVAL); - } - return (u64)-1; -} - static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, struct arch_timer_context *timer, enum kvm_arch_timer_regs treg) @@ -915,6 +1152,10 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, val = kvm_phys_timer_read() - timer_get_offset(timer); break; + case TIMER_REG_VOFF: + val = *timer->offset.vcpu_offset; + break; + default: BUG(); } @@ -926,14 +1167,22 @@ u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, enum kvm_arch_timer_regs treg) { + struct arch_timer_context *timer; + struct timer_map map; u64 val; + get_timer_map(vcpu, &map); + timer = vcpu_get_timer(vcpu, tmr); + + if (timer == map.emul_vtimer || timer == map.emul_ptimer) + return kvm_arm_timer_read(vcpu, timer, treg); + preempt_disable(); - kvm_timer_vcpu_put(vcpu); + timer_save_state(timer); - val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg); + val = kvm_arm_timer_read(vcpu, timer, treg); - kvm_timer_vcpu_load(vcpu); + timer_restore_state(timer); preempt_enable(); return val; @@ -957,6 +1206,10 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, timer_set_cval(timer, val); break; + case TIMER_REG_VOFF: + *timer->offset.vcpu_offset = val; + break; + default: BUG(); } @@ -967,25 +1220,22 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timer_regs treg, u64 val) { - preempt_disable(); - kvm_timer_vcpu_put(vcpu); - - kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val); - - kvm_timer_vcpu_load(vcpu); - preempt_enable(); -} - -static int kvm_timer_starting_cpu(unsigned int cpu) -{ - kvm_timer_init_interrupt(NULL); - return 0; -} + struct arch_timer_context *timer; + struct timer_map map; -static int kvm_timer_dying_cpu(unsigned int cpu) -{ - disable_percpu_irq(host_vtimer_irq); - return 0; + get_timer_map(vcpu, &map); + timer = vcpu_get_timer(vcpu, tmr); + if (timer == map.emul_vtimer || timer == map.emul_ptimer) { + soft_timer_cancel(&timer->hrtimer); + kvm_arm_timer_write(vcpu, timer, treg, val); + timer_emulate(timer); + } else { + preempt_disable(); + timer_save_state(timer); + kvm_arm_timer_write(vcpu, timer, treg, val); + timer_restore_state(timer); + preempt_enable(); + } } static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) @@ -1055,10 +1305,6 @@ static const struct irq_domain_ops timer_domain_ops = { .free = timer_irq_domain_free, }; -static struct irq_ops arch_timer_irq_ops = { - .get_input_level = kvm_arch_timer_get_input_level, -}; - static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags) { *flags = irq_get_trigger_type(virq); @@ -1117,7 +1363,38 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return 0; } -int kvm_timer_hyp_init(bool has_gic) +static void kvm_timer_handle_errata(void) +{ + u64 mmfr0, mmfr1, mmfr4; + + /* + * CNTVOFF_EL2 is broken on some implementations. For those, we trap + * all virtual timer/counter accesses, requiring FEAT_ECV. + * + * However, a hypervisor supporting nesting is likely to mitigate the + * erratum at L0, and not require other levels to mitigate it (which + * would otherwise be a terrible performance sink due to trap + * amplification). + * + * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0, + * and that NV is likely not to (because of limitations of the + * architecture), only enable the workaround when FEAT_VHE and + * FEAT_E2H0 are both detected. Time will tell if this actually holds. + */ + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1); + if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) && + !SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) && + SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) && + (has_vhe() || has_hvhe()) && + cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) { + static_branch_enable(&broken_cntvoff_key); + kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n"); + } +} + +int __init kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; int err; @@ -1149,7 +1426,7 @@ int kvm_timer_hyp_init(bool has_gic) kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); - goto out_free_irq; + goto out_free_vtimer_irq; } static_branch_enable(&has_gic_active_state); @@ -1165,7 +1442,7 @@ int kvm_timer_hyp_init(bool has_gic) if (err) { kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", host_ptimer_irq, err); - return err; + goto out_free_vtimer_irq; } if (has_gic) { @@ -1173,7 +1450,7 @@ int kvm_timer_hyp_init(bool has_gic) kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); - goto out_free_irq; + goto out_free_ptimer_irq; } } @@ -1182,14 +1459,16 @@ int kvm_timer_hyp_init(bool has_gic) kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", info->physical_irq); err = -ENODEV; - goto out_free_irq; + goto out_free_vtimer_irq; } - cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, - "kvm/arm/timer:starting", kvm_timer_starting_cpu, - kvm_timer_dying_cpu); + kvm_timer_handle_errata(); return 0; -out_free_irq: + +out_free_ptimer_irq: + if (info->physical_irq > 0) + free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus()); +out_free_vtimer_irq: free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); return err; } @@ -1203,44 +1482,56 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) { - int vtimer_irq, ptimer_irq, ret; - unsigned long i; + u32 ppis = 0; + bool valid; - vtimer_irq = vcpu_vtimer(vcpu)->irq.irq; - ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu)); - if (ret) - return false; + mutex_lock(&vcpu->kvm->arch.config_lock); - ptimer_irq = vcpu_ptimer(vcpu)->irq.irq; - ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu)); - if (ret) - return false; + for (int i = 0; i < nr_timers(vcpu); i++) { + struct arch_timer_context *ctx; + int irq; - kvm_for_each_vcpu(i, vcpu, vcpu->kvm) { - if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq || - vcpu_ptimer(vcpu)->irq.irq != ptimer_irq) - return false; + ctx = vcpu_get_timer(vcpu, i); + irq = timer_irq(ctx); + if (kvm_vgic_set_owner(vcpu, irq, ctx)) + break; + + /* + * We know by construction that we only have PPIs, so + * all values are less than 32. + */ + ppis |= BIT(irq); } - return true; + valid = hweight32(ppis) == nr_timers(vcpu); + + if (valid) + set_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags); + + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return valid; } -bool kvm_arch_timer_get_input_level(int vintid) +static bool kvm_arch_timer_get_input_level(int vintid) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - struct arch_timer_context *timer; if (WARN(!vcpu, "No vcpu context!\n")) return false; - if (vintid == vcpu_vtimer(vcpu)->irq.irq) - timer = vcpu_vtimer(vcpu); - else if (vintid == vcpu_ptimer(vcpu)->irq.irq) - timer = vcpu_ptimer(vcpu); - else - BUG(); + for (int i = 0; i < nr_timers(vcpu); i++) { + struct arch_timer_context *ctx; + + ctx = vcpu_get_timer(vcpu, i); + if (timer_irq(ctx) == vintid) + return kvm_timer_should_fire(ctx); + } + + /* A timer IRQ has fired, but no matching timer was found? */ + WARN_RATELIMIT(1, "timer INTID%d unknown\n", vintid); - return kvm_timer_should_fire(timer); + return false; } int kvm_timer_enable(struct kvm_vcpu *vcpu) @@ -1269,7 +1560,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) ret = kvm_vgic_map_phys_irq(vcpu, map.direct_vtimer->host_timer_irq, - map.direct_vtimer->irq.irq, + timer_irq(map.direct_vtimer), &arch_timer_irq_ops); if (ret) return ret; @@ -1277,7 +1568,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (map.direct_ptimer) { ret = kvm_vgic_map_phys_irq(vcpu, map.direct_ptimer->host_timer_irq, - map.direct_ptimer->irq.irq, + timer_irq(map.direct_ptimer), &arch_timer_irq_ops); } @@ -1289,45 +1580,17 @@ no_vgic: return 0; } -/* - * On VHE system, we only need to configure the EL2 timer trap register once, - * not for every world switch. - * The host kernel runs at EL2 with HCR_EL2.TGE == 1, - * and this makes those bits have no effect for the host kernel execution. - */ +/* If we have CNTPOFF, permanently set ECV to enable it */ void kvm_timer_init_vhe(void) { - /* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */ - u32 cnthctl_shift = 10; - u64 val; - - /* - * VHE systems allow the guest direct access to the EL1 physical - * timer/counter. - */ - val = read_sysreg(cnthctl_el2); - val |= (CNTHCTL_EL1PCEN << cnthctl_shift); - val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); - write_sysreg(val, cnthctl_el2); -} - -static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq) -{ - struct kvm_vcpu *vcpu; - unsigned long i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu_vtimer(vcpu)->irq.irq = vtimer_irq; - vcpu_ptimer(vcpu)->irq.irq = ptimer_irq; - } + if (cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)) + sysreg_clear_set(cnthctl_el2, 0, CNTHCTL_ECV); } int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int __user *uaddr = (int __user *)(long)attr->addr; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - int irq; + int irq, idx, ret = 0; if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; @@ -1338,21 +1601,42 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!(irq_is_ppi(irq))) return -EINVAL; - if (vcpu->arch.timer_cpu.enabled) - return -EBUSY; + mutex_lock(&vcpu->kvm->arch.config_lock); + + if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, + &vcpu->kvm->arch.flags)) { + ret = -EBUSY; + goto out; + } switch (attr->attr) { case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: - set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq); + idx = TIMER_VTIMER; break; case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: - set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq); + idx = TIMER_PTIMER; + break; + case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: + idx = TIMER_HVTIMER; + break; + case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: + idx = TIMER_HPTIMER; break; default: - return -ENXIO; + ret = -ENXIO; + goto out; } - return 0; + /* + * We cannot validate the IRQ unicity before we run, so take it at + * face value. The verdict will be given on first vcpu run, for each + * vcpu. Yes this is late. Blame it on the stupid API. + */ + vcpu->kvm->arch.timer_data.ppi[idx] = irq; + +out: + mutex_unlock(&vcpu->kvm->arch.config_lock); + return ret; } int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) @@ -1368,11 +1652,17 @@ int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: timer = vcpu_ptimer(vcpu); break; + case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: + timer = vcpu_hvtimer(vcpu); + break; + case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: + timer = vcpu_hptimer(vcpu); + break; default: return -ENXIO; } - irq = timer->irq.irq; + irq = timer_irq(timer); return put_user(irq, uaddr); } @@ -1381,8 +1671,42 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: + case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: return 0; } return -ENXIO; } + +int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, + struct kvm_arm_counter_offset *offset) +{ + int ret = 0; + + if (offset->reserved) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (!kvm_trylock_all_vcpus(kvm)) { + set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags); + + /* + * If userspace decides to set the offset using this + * API rather than merely restoring the counter + * values, the offset applies to both the virtual and + * physical views. + */ + kvm->arch.timer_data.voffset = offset->counter_offset; + kvm->arch.timer_data.poffset = offset->counter_offset; + + kvm_unlock_all_vcpus(kvm); + } else { + ret = -EBUSY; + } + + mutex_unlock(&kvm->lock); + + return ret; +} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9c5573bc4614..4f80da0c0d1d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -6,7 +6,6 @@ #include <linux/bug.h> #include <linux/cpu_pm.h> -#include <linux/entry-kvm.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/kvm_host.h> @@ -16,7 +15,6 @@ #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> -#include <linux/kmemleak.h> #include <linux/kvm.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> @@ -36,51 +34,62 @@ #include <asm/virt.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/kvm_pkvm.h> -#include <asm/kvm_emulate.h> +#include <asm/kvm_ptrauth.h> #include <asm/sections.h> #include <kvm/arm_hypercalls.h> #include <kvm/arm_pmu.h> #include <kvm/arm_psci.h> +#include "sys_regs.h" + static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; -DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); + +enum kvm_wfx_trap_policy { + KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */ + KVM_WFX_NOTRAP, + KVM_WFX_TRAP, +}; + +static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; +static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); -DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base); DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); -static bool vgic_present; +DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); -static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); -DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); +static bool vgic_present, kvm_arm_initialised; -int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; -} +static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); -int kvm_arch_hardware_setup(void *opaque) +bool is_kvm_arm_initialised(void) { - return 0; + return kvm_arm_initialised; } -int kvm_arch_check_processor_compat(void *opaque) +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { - return 0; + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { - int r; + int r = -EINVAL; if (cap->flags) return -EINVAL; + if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap)) + return -EINVAL; + switch (cap->cap) { case KVM_CAP_ARM_NISV_TO_USER: r = 0; @@ -89,9 +98,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; case KVM_CAP_ARM_MTE: mutex_lock(&kvm->lock); - if (!system_supports_mte() || kvm->created_vcpus) { - r = -EINVAL; - } else { + if (system_supports_mte() && !kvm->created_vcpus) { r = 0; set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); } @@ -101,8 +108,35 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = 0; set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + mutex_lock(&kvm->slots_lock); + /* + * To keep things simple, allow changing the chunk + * size only when no memory slots have been created. + */ + if (kvm_are_all_memslots_empty(kvm)) { + u64 new_cap = cap->args[0]; + + if (!new_cap || kvm_is_block_size_supported(new_cap)) { + r = 0; + kvm->arch.mmu.split_page_chunk_size = new_cap; + } + } + mutex_unlock(&kvm->slots_lock); + break; + case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + r = 0; + set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags); + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_ARM_SEA_TO_USER: + r = 0; + set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags); + break; default: - r = -EINVAL; break; } @@ -114,39 +148,32 @@ static int kvm_arm_default_max_vcpus(void) return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; } -static void set_default_spectre(struct kvm *kvm) -{ - /* - * The default is to expose CSV2 == 1 if the HW isn't affected. - * Although this is a per-CPU feature, we make it global because - * asymmetric systems are just a nuisance. - * - * Userspace can override this as long as it doesn't promise - * the impossible. - */ - if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) - kvm->arch.pfr0_csv2 = 1; - if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) - kvm->arch.pfr0_csv3 = 1; -} - /** * kvm_arch_init_vm - initializes a VM data structure * @kvm: pointer to the KVM struct + * @type: kvm device type */ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; + mutex_init(&kvm->arch.config_lock); + +#ifdef CONFIG_LOCKDEP + /* Clue in lockdep that the config_lock must be taken inside kvm->lock */ + mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); + mutex_unlock(&kvm->arch.config_lock); + mutex_unlock(&kvm->lock); +#endif + + kvm_init_nested(kvm); + ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; - ret = pkvm_init_host_vm(kvm); - if (ret) - goto err_unshare_kvm; - - if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) { ret = -ENOMEM; goto err_unshare_kvm; } @@ -156,19 +183,26 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto err_free_cpumask; + if (is_protected_kvm_enabled()) { + /* + * If any failures occur after this is successful, make sure to + * call __pkvm_unreserve_vm to unreserve the VM in hyp. + */ + ret = pkvm_init_host_vm(kvm); + if (ret) + goto err_free_cpumask; + } + kvm_vgic_early_init(kvm); + kvm_timer_init_vm(kvm); + /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->max_vcpus = kvm_arm_default_max_vcpus(); - set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); - /* - * Initialise the default PMUver before there is a chance to - * create an actual PMU. - */ - kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); + bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); return 0; @@ -184,6 +218,28 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + kvm_sys_regs_create_debugfs(kvm); + kvm_s2_ptdump_create_debugfs(kvm); +} + +static void kvm_destroy_mpidr_data(struct kvm *kvm) +{ + struct kvm_mpidr_data *data; + + mutex_lock(&kvm->arch.config_lock); + + data = rcu_dereference_protected(kvm->arch.mpidr_data, + lockdep_is_held(&kvm->arch.config_lock)); + if (data) { + rcu_assign_pointer(kvm->arch.mpidr_data, NULL); + synchronize_rcu(); + kfree(data); + } + + mutex_unlock(&kvm->arch.config_lock); +} /** * kvm_arch_destroy_vm - destroy the VM data structure @@ -199,20 +255,62 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (is_protected_kvm_enabled()) pkvm_destroy_hyp_vm(kvm); + kvm_destroy_mpidr_data(kvm); + + kfree(kvm->arch.sysreg_masks); kvm_destroy_vcpus(kvm); kvm_unshare_hyp(kvm, kvm + 1); + + kvm_arm_teardown_hypercalls(kvm); +} + +static bool kvm_has_full_ptr_auth(void) +{ + bool apa, gpa, api, gpi, apa3, gpa3; + u64 isar1, isar2, val; + + /* + * Check that: + * + * - both Address and Generic auth are implemented for a given + * algorithm (Q5, IMPDEF or Q3) + * - only a single algorithm is implemented. + */ + if (!system_has_full_ptr_auth()) + return false; + + isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); + isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + + apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1); + val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1); + gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP); + + api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1); + val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1); + gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP); + + apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2); + val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2); + gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP); + + return (apa == gpa && api == gpi && apa3 == gpa3 && + (apa + api + apa3) == 1); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; + + if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext)) + return 0; + switch (ext) { case KVM_CAP_IRQCHIP: r = vgic_present; break; case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: @@ -230,6 +328,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: + case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_COUNTER_OFFSET: + case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: + case KVM_CAP_ARM_SEA_TO_USER: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -274,7 +376,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_arm_pvtime_supported(); break; case KVM_CAP_ARM_EL1_32BIT: - r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); + r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1); + break; + case KVM_CAP_ARM_EL2: + r = cpus_have_final_cap(ARM64_HAS_NESTED_VIRT); + break; + case KVM_CAP_ARM_EL2_E2H0: + r = cpus_have_final_cap(ARM64_HAS_HCR_NV1); break; case KVM_CAP_GUEST_DEBUG_HW_BPS: r = get_num_brps(); @@ -283,10 +391,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = get_num_wrps(); break; case KVM_CAP_ARM_PMU_V3: - r = kvm_arm_support_pmu_v3(); + r = kvm_supports_guest_pmuv3(); break; case KVM_CAP_ARM_INJECT_SERROR_ESR: - r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN); + r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); break; case KVM_CAP_ARM_VM_IPA_SIZE: r = get_kvm_ipa_limit(); @@ -296,8 +404,27 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_ARM_PTRAUTH_ADDRESS: case KVM_CAP_ARM_PTRAUTH_GENERIC: - r = system_has_full_ptr_auth(); + r = kvm_has_full_ptr_auth(); break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + if (kvm) + r = kvm->arch.mmu.split_page_chunk_size; + else + r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + break; + case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES: + r = kvm_supported_block_sizes(); + break; + case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES: + r = BIT(0); + break; + case KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED: + if (!kvm) + r = -EINVAL; + else + r = kvm_supports_cacheable_pfnmap(); + break; + default: r = 0; } @@ -318,7 +445,7 @@ struct kvm *kvm_arch_alloc_vm(void) if (!has_vhe()) return kzalloc(sz, GFP_KERNEL_ACCOUNT); - return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); + return kvzalloc(sz, GFP_KERNEL_ACCOUNT); } int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) @@ -336,34 +463,46 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int err; + spin_lock_init(&vcpu->arch.mp_state_lock); + +#ifdef CONFIG_LOCKDEP + /* Inform lockdep that the config_lock is acquired after vcpu->mutex */ + mutex_lock(&vcpu->mutex); + mutex_lock(&vcpu->kvm->arch.config_lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); + mutex_unlock(&vcpu->mutex); +#endif + /* Force users to call KVM_ARM_VCPU_INIT */ - vcpu->arch.target = -1; - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + vcpu_clear_flag(vcpu, VCPU_INITIALIZED); vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; - /* - * Default value for the FP state, will be overloaded at load - * time if we support FP (pretty likely) - */ - vcpu->arch.fp_state = FP_STATE_FREE; - /* Set up the timer */ kvm_timer_vcpu_init(vcpu); kvm_pmu_vcpu_init(vcpu); - kvm_arm_reset_debug_ptr(vcpu); - kvm_arm_pvtime_vcpu_init(&vcpu->arch); vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + /* + * This vCPU may have been created after mpidr_data was initialized. + * Throw out the pre-computed mappings if that is the case which forces + * KVM to fall back to iteratively searching the vCPUs. + */ + kvm_destroy_mpidr_data(vcpu->kvm); + err = kvm_vgic_vcpu_init(vcpu); if (err) return err; - return kvm_share_hyp(vcpu, vcpu + 1); + err = kvm_share_hyp(vcpu, vcpu + 1); + if (err) + kvm_vgic_vcpu_destroy(vcpu); + + return err; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -372,13 +511,13 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm))) - static_branch_dec(&userspace_irqchip_in_use); - - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + if (!is_protected_kvm_enabled()) + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + else + free_hyp_memcache(&vcpu->arch.pkvm_memcache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); - + kvm_vgic_vcpu_destroy(vcpu); kvm_arm_vcpu_destroy(vcpu); } @@ -392,15 +531,81 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) } +static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) { + /* + * Either we're running an L2 guest, and the API/APK bits come + * from L1's HCR_EL2, or API/APK are both set. + */ + if (unlikely(is_nested_ctxt(vcpu))) { + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + val &= (HCR_API | HCR_APK); + vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK); + vcpu->arch.hcr_el2 |= val; + } else { + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + } + + /* + * Save the host keys if there is any chance for the guest + * to use pauth, as the entry code will reload the guest + * keys in that case. + */ + if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) { + struct kvm_cpu_context *ctxt; + + ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt); + ptrauth_save_keys(ctxt); + } + } +} + +static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) + return kvm_wfi_trap_policy == KVM_WFX_NOTRAP; + + return single_task_running() && + (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) || + vcpu->kvm->arch.vgic.nassgireq); +} + +static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) + return kvm_wfe_trap_policy == KVM_WFX_NOTRAP; + + return single_task_running(); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_s2_mmu *mmu; int *last_ran; + if (is_protected_kvm_enabled()) + goto nommu; + + if (vcpu_has_nv(vcpu)) + kvm_vcpu_load_hw_mmu(vcpu); + mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); /* + * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2, + * which happens eagerly in VHE. + * + * Also, the VMID allocator only preserves VMIDs that are active at the + * time of rollover, so KVM might need to grab a new VMID for the MMU if + * this is called from kvm_sched_in(). + */ + kvm_arm_vmid_update(&mmu->vmid); + + /* * We guarantee that both TLBs and I-cache are private to each * vcpu. If detecting that a vcpu from the same VM has * previously run on the same physical CPU, call into the @@ -409,78 +614,110 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * We might get preempted before the vCPU actually runs, but * over-invalidation doesn't affect correctness. */ - if (*last_ran != vcpu->vcpu_id) { + if (*last_ran != vcpu->vcpu_idx) { kvm_call_hyp(__kvm_flush_cpu_context, mmu); - *last_ran = vcpu->vcpu_id; + *last_ran = vcpu->vcpu_idx; } +nommu: vcpu->cpu = cpu; - kvm_vgic_load(vcpu); + /* + * The timer must be loaded before the vgic to correctly set up physical + * interrupt deactivation in nested state (e.g. timer interrupt). + */ kvm_timer_vcpu_load(vcpu); + kvm_vgic_load(vcpu); + kvm_vcpu_load_debug(vcpu); + kvm_vcpu_load_fgt(vcpu); if (has_vhe()) - kvm_vcpu_load_sysregs_vhe(vcpu); + kvm_vcpu_load_vhe(vcpu); kvm_arch_vcpu_load_fp(vcpu); kvm_vcpu_pmu_restore_guest(vcpu); if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); - if (single_task_running()) - vcpu_clear_wfx_traps(vcpu); + if (kvm_vcpu_should_clear_twe(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_TWE; else - vcpu_set_wfx_traps(vcpu); + vcpu->arch.hcr_el2 |= HCR_TWE; - if (vcpu_has_ptrauth(vcpu)) - vcpu_ptrauth_disable(vcpu); - kvm_arch_vcpu_load_debug_state_flags(vcpu); + if (kvm_vcpu_should_clear_twi(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_TWI; + else + vcpu->arch.hcr_el2 |= HCR_TWI; + + vcpu_set_pauth_traps(vcpu); + + if (is_protected_kvm_enabled()) { + kvm_call_hyp_nvhe(__pkvm_vcpu_load, + vcpu->kvm->arch.pkvm.handle, + vcpu->vcpu_idx, vcpu->arch.hcr_el2); + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + } - if (!cpumask_test_cpu(smp_processor_id(), vcpu->kvm->arch.supported_cpus)) + if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_arch_vcpu_put_debug_state_flags(vcpu); + if (is_protected_kvm_enabled()) { + kvm_call_hyp(__vgic_v3_save_aprs, &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp_nvhe(__pkvm_vcpu_put); + } + + kvm_vcpu_put_debug(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) - kvm_vcpu_put_sysregs_vhe(vcpu); + kvm_vcpu_put_vhe(vcpu); kvm_timer_vcpu_put(vcpu); kvm_vgic_put(vcpu); kvm_vcpu_pmu_restore_host(vcpu); + if (vcpu_has_nv(vcpu)) + kvm_vcpu_put_hw_mmu(vcpu); kvm_arm_vmid_clear_active(); vcpu_clear_on_unsupported_cpu(vcpu); vcpu->cpu = -1; } -void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) +static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) { - vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); kvm_make_request(KVM_REQ_SLEEP, vcpu); kvm_vcpu_kick(vcpu); } +void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->arch.mp_state_lock); + __kvm_arm_vcpu_power_off(vcpu); + spin_unlock(&vcpu->arch.mp_state_lock); +} + bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu) { - return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED; + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED; } static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu) { - vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED); kvm_make_request(KVM_REQ_SUSPEND, vcpu); kvm_vcpu_kick(vcpu); } static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu) { - return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED; + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - *mp_state = vcpu->arch.mp_state; + *mp_state = READ_ONCE(vcpu->arch.mp_state); return 0; } @@ -490,12 +727,14 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { int ret = 0; + spin_lock(&vcpu->arch.mp_state_lock); + switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: - vcpu->arch.mp_state = *mp_state; + WRITE_ONCE(vcpu->arch.mp_state, *mp_state); break; case KVM_MP_STATE_STOPPED: - kvm_arm_vcpu_power_off(vcpu); + __kvm_arm_vcpu_power_off(vcpu); break; case KVM_MP_STATE_SUSPENDED: kvm_arm_vcpu_suspend(vcpu); @@ -504,6 +743,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, ret = -EINVAL; } + spin_unlock(&vcpu->arch.mp_state_lock); + return ret; } @@ -516,7 +757,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, */ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF); + bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF | HCR_VSE); + return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) && !kvm_arm_vcpu_stopped(v) && !v->arch.pause); } @@ -533,9 +775,56 @@ unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) } #endif -static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) +static void kvm_init_mpidr_data(struct kvm *kvm) { - return vcpu->arch.target >= 0; + struct kvm_mpidr_data *data = NULL; + unsigned long c, mask, nr_entries; + u64 aff_set = 0, aff_clr = ~0UL; + struct kvm_vcpu *vcpu; + + mutex_lock(&kvm->arch.config_lock); + + if (rcu_access_pointer(kvm->arch.mpidr_data) || + atomic_read(&kvm->online_vcpus) == 1) + goto out; + + kvm_for_each_vcpu(c, vcpu, kvm) { + u64 aff = kvm_vcpu_get_mpidr_aff(vcpu); + aff_set |= aff; + aff_clr &= aff; + } + + /* + * A significant bit can be either 0 or 1, and will only appear in + * aff_set. Use aff_clr to weed out the useless stuff. + */ + mask = aff_set ^ aff_clr; + nr_entries = BIT_ULL(hweight_long(mask)); + + /* + * Don't let userspace fool us. If we need more than a single page + * to describe the compressed MPIDR array, just fall back to the + * iterative method. Single vcpu VMs do not need this either. + */ + if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE) + data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries), + GFP_KERNEL_ACCOUNT); + + if (!data) + goto out; + + data->mpidr_mask = mask; + + kvm_for_each_vcpu(c, vcpu, kvm) { + u64 aff = kvm_vcpu_get_mpidr_aff(vcpu); + u16 index = kvm_mpidr_index(data, aff); + + data->cmpidr_to_idx[index] = c; + } + + rcu_assign_pointer(kvm->arch.mpidr_data, data); +out: + mutex_unlock(&kvm->arch.config_lock); } /* @@ -554,14 +843,10 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (!kvm_arm_vcpu_is_finalized(vcpu)) return -EPERM; - ret = kvm_arch_vcpu_run_map_fp(vcpu); - if (ret) - return ret; - if (likely(vcpu_has_run_once(vcpu))) return 0; - kvm_arm_vcpu_init_debug(vcpu); + kvm_init_mpidr_data(kvm); if (likely(irqchip_in_kernel(kvm))) { /* @@ -573,39 +858,49 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } - ret = kvm_timer_enable(vcpu); + ret = kvm_finalize_sys_regs(vcpu); if (ret) return ret; - ret = kvm_arm_pmu_v3_enable(vcpu); + if (vcpu_has_nv(vcpu)) { + ret = kvm_vcpu_allocate_vncr_tlb(vcpu); + if (ret) + return ret; + + ret = kvm_vgic_vcpu_nv_init(vcpu); + if (ret) + return ret; + } + + /* + * This needs to happen after any restriction has been applied + * to the feature set. + */ + kvm_calculate_traps(vcpu); + + ret = kvm_timer_enable(vcpu); if (ret) return ret; + if (kvm_vcpu_has_pmu(vcpu)) { + ret = kvm_arm_pmu_v3_enable(vcpu); + if (ret) + return ret; + } + if (is_protected_kvm_enabled()) { ret = pkvm_create_hyp_vm(kvm); if (ret) return ret; - } - if (!irqchip_in_kernel(kvm)) { - /* - * Tell the rest of the code that there are userspace irqchip - * VMs in the wild. - */ - static_branch_inc(&userspace_irqchip_in_use); + ret = pkvm_create_hyp_vcpu(vcpu); + if (ret) + return ret; } - /* - * Initialize traps for protected VMs. - * NOTE: Move to run in EL2 directly, rather than via a hypercall, once - * the code is in place for first run initialization at EL2. - */ - if (kvm_vm_is_protected(kvm)) - kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); - - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); return ret; } @@ -678,15 +973,16 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) * doorbells to be signalled, should an interrupt become pending. */ preempt_disable(); - kvm_vgic_vmcr_sync(vcpu); - vgic_v4_put(vcpu, true); + vcpu_set_flag(vcpu, IN_WFI); + kvm_vgic_put(vcpu); preempt_enable(); kvm_vcpu_halt(vcpu); vcpu_clear_flag(vcpu, IN_WFIT); preempt_disable(); - vgic_v4_load(vcpu); + vcpu_clear_flag(vcpu, IN_WFI); + kvm_vgic_load(vcpu); preempt_enable(); } @@ -735,6 +1031,9 @@ static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) static int check_vcpu_requests(struct kvm_vcpu *vcpu) { if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) kvm_vcpu_sleep(vcpu); @@ -747,26 +1046,34 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) */ kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); + /* Process interrupts deactivated through a trap */ + if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu)) + kvm_vgic_process_async_update(vcpu); + if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) kvm_update_stolen_time(vcpu); if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) { /* The distributor enable bits were changed */ preempt_disable(); - vgic_v4_put(vcpu, false); + vgic_v4_put(vcpu); vgic_v4_load(vcpu); preempt_enable(); } if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) - kvm_pmu_handle_pmcr(vcpu, - __vcpu_sys_reg(vcpu, PMCR_EL0)); + kvm_vcpu_reload_pmu(vcpu); + + if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu)) + kvm_vcpu_pmu_restore_guest(vcpu); if (kvm_check_request(KVM_REQ_SUSPEND, vcpu)) return kvm_vcpu_suspend(vcpu); if (kvm_dirty_ring_check_request(vcpu)) return 0; + + check_nested_vcpu_requests(vcpu); } return 1; @@ -777,6 +1084,9 @@ static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) if (likely(!vcpu_mode_is_32bit(vcpu))) return false; + if (vcpu_has_nv(vcpu)) + return true; + return !kvm_supports_32bit_el0(); } @@ -805,7 +1115,7 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) * state gets updated in kvm_timer_update_run and * kvm_pmu_update_run below). */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { if (kvm_timer_should_notify_user(vcpu) || kvm_pmu_should_notify_user(vcpu)) { *ret = -EINTR; @@ -861,13 +1171,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_MMIO) { ret = kvm_handle_mmio_return(vcpu); - if (ret) + if (ret <= 0) return ret; } vcpu_load(vcpu); - if (run->immediate_exit) { + if (!vcpu->wants_to_run) { ret = -EINTR; goto out; } @@ -881,7 +1191,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* * Check conditions before entering the guest */ - ret = xfer_to_guest_mode_handle_work(vcpu); + ret = kvm_xfer_to_guest_mode_handle_work(vcpu); if (!ret) ret = 1; @@ -895,16 +1205,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ preempt_disable(); - /* - * The VMID allocator only tracks active VMIDs per - * physical CPU, and therefore the VMID allocated may not be - * preserved on VMID roll-over if the task was preempted, - * making a thread's VMID inactive. So we need to call - * kvm_arm_vmid_update() in non-premptible context. - */ - kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid); + kvm_nested_flush_hwstate(vcpu); - kvm_pmu_flush_hwstate(vcpu); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_flush_hwstate(vcpu); local_irq_disable(); @@ -923,8 +1227,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ - kvm_pmu_sync_hwstate(vcpu); - if (static_branch_unlikely(&userspace_irqchip_in_use)) + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_sync_hwstate(vcpu); + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); @@ -932,7 +1237,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) continue; } - kvm_arm_setup_debug(vcpu); kvm_arch_vcpu_ctxflush_fp(vcpu); /************************************************************** @@ -949,14 +1253,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * Back from guest *************************************************************/ - kvm_arm_clear_debug(vcpu); - /* * We must sync the PMU state before the vgic state so * that the vgic can properly sample the updated state of the * interrupt line. */ - kvm_pmu_sync_hwstate(vcpu); + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_sync_hwstate(vcpu); /* * Sync the vgic state before syncing the timer state because @@ -970,9 +1273,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * we don't want vtimer interrupts to race with syncing the * timer virtual interrupt state. */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); + if (is_hyp_ctxt(vcpu)) + kvm_timer_sync_nested(vcpu); + kvm_arch_vcpu_ctxsync_fp(vcpu); /* @@ -1000,6 +1306,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); + kvm_nested_sync_hwstate(vcpu); + preempt_enable(); /* @@ -1017,7 +1325,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * invalid. The VMM can try and fix it by issuing a * KVM_ARM_VCPU_INIT if it really wants to. */ - vcpu->arch.target = -1; + vcpu_clear_flag(vcpu, VCPU_INITIALIZED); ret = ARM_EXCEPTION_IL; } @@ -1086,27 +1394,23 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status) { u32 irq = irq_level->irq; - unsigned int irq_type, vcpu_idx, irq_num; - int nrcpus = atomic_read(&kvm->online_vcpus); + unsigned int irq_type, vcpu_id, irq_num; struct kvm_vcpu *vcpu = NULL; bool level = irq_level->level; irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; - vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; - vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); + vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; + vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; - trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level); + trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level); switch (irq_type) { case KVM_ARM_IRQ_TYPE_CPU: if (irqchip_in_kernel(kvm)) return -ENXIO; - if (vcpu_idx >= nrcpus) - return -EINVAL; - - vcpu = kvm_get_vcpu(kvm, vcpu_idx); + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); if (!vcpu) return -EINVAL; @@ -1118,17 +1422,14 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, if (!irqchip_in_kernel(kvm)) return -ENXIO; - if (vcpu_idx >= nrcpus) - return -EINVAL; - - vcpu = kvm_get_vcpu(kvm, vcpu_idx); + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); if (!vcpu) return -EINVAL; if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) return -EINVAL; - return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL); + return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL); case KVM_ARM_IRQ_TYPE_SPI: if (!irqchip_in_kernel(kvm)) return -ENXIO; @@ -1136,64 +1437,172 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, if (irq_num < VGIC_NR_PRIVATE_IRQS) return -EINVAL; - return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL); + return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL); } return -EINVAL; } -static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, - const struct kvm_vcpu_init *init) +static unsigned long system_supported_vcpu_features(void) +{ + unsigned long features = KVM_VCPU_VALID_FEATURES; + + if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1)) + clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features); + + if (!kvm_supports_guest_pmuv3()) + clear_bit(KVM_ARM_VCPU_PMU_V3, &features); + + if (!system_supports_sve()) + clear_bit(KVM_ARM_VCPU_SVE, &features); + + if (!kvm_has_full_ptr_auth()) { + clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features); + clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features); + } + + if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT)) + clear_bit(KVM_ARM_VCPU_HAS_EL2, &features); + + return features; +} + +static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) { - unsigned int i, ret; - u32 phys_target = kvm_target_cpu(); + unsigned long features = init->features[0]; + int i; + + if (features & ~KVM_VCPU_VALID_FEATURES) + return -ENOENT; - if (init->target != phys_target) + for (i = 1; i < ARRAY_SIZE(init->features); i++) { + if (init->features[i]) + return -ENOENT; + } + + if (features & ~system_supported_vcpu_features()) return -EINVAL; /* - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must - * use the same target. + * For now make sure that both address/generic pointer authentication + * features are requested by the userspace together. */ - if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) + if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) != + test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features)) return -EINVAL; - /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ - for (i = 0; i < sizeof(init->features) * 8; i++) { - bool set = (init->features[i / 32] & (1 << (i % 32))); + if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) + return 0; - if (set && i >= KVM_VCPU_MAX_FEATURES) - return -ENOENT; + /* MTE is incompatible with AArch32 */ + if (kvm_has_mte(vcpu->kvm)) + return -EINVAL; - /* - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must - * use the same feature set. - */ - if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && - test_bit(i, vcpu->arch.features) != set) - return -EINVAL; + /* NV is incompatible with AArch32 */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features)) + return -EINVAL; - if (set) - set_bit(i, vcpu->arch.features); - } + return 0; +} + +static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned long features = init->features[0]; - vcpu->arch.target = phys_target; + return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features, + KVM_VCPU_MAX_FEATURES); +} + +static int kvm_setup_vcpu(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret = 0; + + /* + * When the vCPU has a PMU, but no PMU is set for the guest + * yet, set the default one. + */ + if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu) + ret = kvm_arm_set_default_pmu(kvm); + + /* Prepare for nested if required */ + if (!ret && vcpu_has_nv(vcpu)) + ret = kvm_vcpu_init_nested(vcpu); + + return ret; +} + +static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned long features = init->features[0]; + struct kvm *kvm = vcpu->kvm; + int ret = -EINVAL; + + mutex_lock(&kvm->arch.config_lock); + + if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) && + kvm_vcpu_init_changed(vcpu, init)) + goto out_unlock; + + bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES); + + ret = kvm_setup_vcpu(vcpu); + if (ret) + goto out_unlock; /* Now we know what it is, we can reset it. */ - ret = kvm_reset_vcpu(vcpu); - if (ret) { - vcpu->arch.target = -1; - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); - } + kvm_reset_vcpu(vcpu); + set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags); + vcpu_set_flag(vcpu, VCPU_INITIALIZED); + ret = 0; +out_unlock: + mutex_unlock(&kvm->arch.config_lock); return ret; } +static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + int ret; + + if (init->target != KVM_ARM_TARGET_GENERIC_V8 && + init->target != kvm_target_cpu()) + return -EINVAL; + + ret = kvm_vcpu_init_check_features(vcpu, init); + if (ret) + return ret; + + if (!kvm_vcpu_initialized(vcpu)) + return __kvm_vcpu_set_target(vcpu, init); + + if (kvm_vcpu_init_changed(vcpu, init)) + return -EINVAL; + + kvm_reset_vcpu(vcpu); + return 0; +} + static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) { + bool power_off = false; int ret; + /* + * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid + * reflecting it in the finalized feature set, thus limiting its scope + * to a single KVM_ARM_VCPU_INIT call. + */ + if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) { + init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF); + power_off = true; + } + ret = kvm_vcpu_set_target(vcpu, init); if (ret) return ret; @@ -1215,15 +1624,18 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, } vcpu_reset_hcr(vcpu); - vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT; /* * Handle the "start in power-off" case. */ - if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) - kvm_arm_vcpu_power_off(vcpu); + spin_lock(&vcpu->arch.mp_state_lock); + + if (power_off) + __kvm_arm_vcpu_power_off(vcpu); else - vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); + + spin_unlock(&vcpu->arch.mp_state_lock); return 0; } @@ -1391,6 +1803,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_GET_VCPU_EVENTS: { struct kvm_vcpu_events events; + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + if (kvm_arm_vcpu_get_events(vcpu, &events)) return -EINVAL; @@ -1402,6 +1817,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_VCPU_EVENTS: { struct kvm_vcpu_events events; + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + if (copy_from_user(&events, argp, sizeof(events))) return -EFAULT; @@ -1425,15 +1843,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } -void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { - + return -ENOIOCTLCMD; } -void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - const struct kvm_memory_slot *memslot) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - kvm_flush_remote_tlbs(kvm); + } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, @@ -1449,11 +1867,31 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, } } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_ARM_VM_SMCCC_CTRL: + return kvm_vm_smccc_has_attr(kvm, attr); + default: + return -ENXIO; + } +} + +static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_ARM_VM_SMCCC_CTRL: + return kvm_vm_smccc_set_attr(kvm, attr); + default: + return -ENXIO; + } +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; + struct kvm_device_attr attr; switch (ioctl) { case KVM_CREATE_IRQCHIP: { @@ -1473,9 +1911,9 @@ long kvm_arch_vm_ioctl(struct file *filp, return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); } case KVM_ARM_PREFERRED_TARGET: { - struct kvm_vcpu_init init; - - kvm_vcpu_preferred_target(&init); + struct kvm_vcpu_init init = { + .target = KVM_ARM_TARGET_GENERIC_V8, + }; if (copy_to_user(argp, &init, sizeof(init))) return -EFAULT; @@ -1489,6 +1927,32 @@ long kvm_arch_vm_ioctl(struct file *filp, return -EFAULT; return kvm_vm_ioctl_mte_copy_tags(kvm, ©_tags); } + case KVM_ARM_SET_COUNTER_OFFSET: { + struct kvm_arm_counter_offset offset; + + if (copy_from_user(&offset, argp, sizeof(offset))) + return -EFAULT; + return kvm_vm_ioctl_set_counter_offset(kvm, &offset); + } + case KVM_HAS_DEVICE_ATTR: { + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + return kvm_vm_has_attr(kvm, &attr); + } + case KVM_SET_DEVICE_ATTR: { + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + return kvm_vm_set_attr(kvm, &attr); + } + case KVM_ARM_GET_REG_WRITABLE_MASKS: { + struct reg_mask_range range; + + if (copy_from_user(&range, argp, sizeof(range))) + return -EFAULT; + return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range); + } default: return -EINVAL; } @@ -1507,6 +1971,11 @@ static unsigned long nvhe_percpu_order(void) return size ? get_order(size) : 0; } +static size_t pkvm_host_sve_state_order(void) +{ + return get_order(pkvm_host_sve_state_size()); +} + /* A lookup table holding the hypervisor VA for each vector slot */ static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; @@ -1539,7 +2008,7 @@ static int kvm_init_vector_slots(void) return 0; } -static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) +static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); unsigned long tcr; @@ -1555,8 +2024,18 @@ static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) params->mair_el2 = read_sysreg(mair_el1); - tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1; - tcr &= ~TCR_T0SZ_MASK; + tcr = read_sysreg(tcr_el1); + if (cpus_have_final_cap(ARM64_KVM_HVHE)) { + tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK); + tcr |= TCR_EPD1_MASK; + } else { + unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr); + + tcr &= TCR_EL2_MASK; + tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips); + if (lpa2_is_enabled()) + tcr |= TCR_EL2_DS; + } tcr |= TCR_T0SZ(hyp_va_bits); params->tcr_el2 = tcr; @@ -1565,6 +2044,8 @@ static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; else params->hcr_el2 = HCR_HOST_NVHE_FLAGS; + if (cpus_have_final_cap(ARM64_KVM_HVHE)) + params->hcr_el2 |= HCR_E2H; params->vttbr = params->vtcr = 0; /* @@ -1586,7 +2067,7 @@ static void hyp_install_host_vector(void) * Call initialization code, and switch to the full blown HYP code. * If the cpucaps haven't been finalized yet, something has gone very * wrong, and hyp will crash and burn when it uses any - * cpus_have_const_cap() wrapper. + * cpus_have_*_cap() wrapper. */ BUG_ON(!system_capabilities_finalized()); params = this_cpu_ptr_nvhe_sym(kvm_init_params); @@ -1647,7 +2128,8 @@ static void cpu_set_hyp_vector(void) static void cpu_hyp_init_context(void) { - kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); + kvm_init_host_cpu_context(host_data_ptr(host_ctxt)); + kvm_init_host_debug_data(); if (!is_kernel_in_hyp_mode()) cpu_init_hyp_mode(); @@ -1656,10 +2138,11 @@ static void cpu_hyp_init_context(void) static void cpu_hyp_init_features(void) { cpu_set_hyp_vector(); - kvm_arm_init_debug(); - if (is_kernel_in_hyp_mode()) + if (is_kernel_in_hyp_mode()) { kvm_timer_init_vhe(); + kvm_debug_init_vhe(); + } if (vgic_present) kvm_vgic_init_cpu_hardware(); @@ -1672,32 +2155,49 @@ static void cpu_hyp_reinit(void) cpu_hyp_init_features(); } -static void _kvm_arch_hardware_enable(void *discard) +static void cpu_hyp_init(void *discard) { - if (!__this_cpu_read(kvm_arm_hardware_enabled)) { + if (!__this_cpu_read(kvm_hyp_initialized)) { cpu_hyp_reinit(); - __this_cpu_write(kvm_arm_hardware_enabled, 1); + __this_cpu_write(kvm_hyp_initialized, 1); } } -int kvm_arch_hardware_enable(void) +static void cpu_hyp_uninit(void *discard) { - _kvm_arch_hardware_enable(NULL); - return 0; + if (!is_protected_kvm_enabled() && __this_cpu_read(kvm_hyp_initialized)) { + cpu_hyp_reset(); + __this_cpu_write(kvm_hyp_initialized, 0); + } } -static void _kvm_arch_hardware_disable(void *discard) +int kvm_arch_enable_virtualization_cpu(void) { - if (__this_cpu_read(kvm_arm_hardware_enabled)) { - cpu_hyp_reset(); - __this_cpu_write(kvm_arm_hardware_enabled, 0); - } + /* + * Most calls to this function are made with migration + * disabled, but not with preemption disabled. The former is + * enough to ensure correctness, but most of the helpers + * expect the later and will throw a tantrum otherwise. + */ + preempt_disable(); + + cpu_hyp_init(NULL); + + kvm_vgic_cpu_up(); + kvm_timer_cpu_up(); + + preempt_enable(); + + return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { + kvm_timer_cpu_down(); + kvm_vgic_cpu_down(); + if (!is_protected_kvm_enabled()) - _kvm_arch_hardware_disable(NULL); + cpu_hyp_uninit(NULL); } #ifdef CONFIG_CPU_PM @@ -1706,16 +2206,16 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self, void *v) { /* - * kvm_arm_hardware_enabled is left with its old value over + * kvm_hyp_initialized is left with its old value over * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should * re-enable hyp. */ switch (cmd) { case CPU_PM_ENTER: - if (__this_cpu_read(kvm_arm_hardware_enabled)) + if (__this_cpu_read(kvm_hyp_initialized)) /* - * don't update kvm_arm_hardware_enabled here - * so that the hardware will be re-enabled + * don't update kvm_hyp_initialized here + * so that the hyp will be re-enabled * when we resume. See below. */ cpu_hyp_reset(); @@ -1723,8 +2223,8 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self, return NOTIFY_OK; case CPU_PM_ENTER_FAILED: case CPU_PM_EXIT: - if (__this_cpu_read(kvm_arm_hardware_enabled)) - /* The hardware was enabled before suspend. */ + if (__this_cpu_read(kvm_hyp_initialized)) + /* The hyp was enabled before suspend. */ cpu_hyp_reinit(); return NOTIFY_OK; @@ -1738,26 +2238,26 @@ static struct notifier_block hyp_init_cpu_pm_nb = { .notifier_call = hyp_init_cpu_pm_notifier, }; -static void hyp_cpu_pm_init(void) +static void __init hyp_cpu_pm_init(void) { if (!is_protected_kvm_enabled()) cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); } -static void hyp_cpu_pm_exit(void) +static void __init hyp_cpu_pm_exit(void) { if (!is_protected_kvm_enabled()) cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); } #else -static inline void hyp_cpu_pm_init(void) +static inline void __init hyp_cpu_pm_init(void) { } -static inline void hyp_cpu_pm_exit(void) +static inline void __init hyp_cpu_pm_exit(void) { } #endif -static void init_cpu_logical_map(void) +static void __init init_cpu_logical_map(void) { unsigned int cpu; @@ -1774,7 +2274,7 @@ static void init_cpu_logical_map(void) #define init_psci_0_1_impl_state(config, what) \ config.psci_0_1_ ## what ## _implemented = psci_ops.what -static bool init_psci_relay(void) +static bool __init init_psci_relay(void) { /* * If PSCI has not been initialized, protected KVM cannot install @@ -1786,6 +2286,7 @@ static bool init_psci_relay(void) } kvm_host_psci_config.version = psci_ops.get_version(); + kvm_host_psci_config.smccc_version = arm_smccc_get_version(); if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); @@ -1797,14 +2298,14 @@ static bool init_psci_relay(void) return true; } -static int init_subsystems(void) +static int __init init_subsystems(void) { int err = 0; /* * Enable hardware so that subsystem initialisation can access EL2. */ - on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); + on_each_cpu(cpu_hyp_init, NULL, 1); /* * Register CPU lower-power notifier @@ -1821,6 +2322,19 @@ static int init_subsystems(void) break; case -ENODEV: case -ENXIO: + /* + * No VGIC? No pKVM for you. + * + * Protected mode assumes that VGICv3 is present, so no point + * in trying to hobble along if vgic initialization fails. + */ + if (is_protected_kvm_enabled()) + goto out; + + /* + * Otherwise, userspace could choose to implement a GIC for its + * guest on non-cooperative hardware. + */ vgic_present = false; err = 0; break; @@ -1828,6 +2342,14 @@ static int init_subsystems(void) goto out; } + if (kvm_mode == KVM_MODE_NV && + !(vgic_present && (kvm_vgic_global_state.type == VGIC_V3 || + kvm_vgic_global_state.has_gcie_v3_compat))) { + kvm_err("NV support requires GICv3 or GICv5 with legacy support, giving up\n"); + err = -EINVAL; + goto out; + } + /* * Init HYP architected timer support */ @@ -1838,24 +2360,49 @@ static int init_subsystems(void) kvm_register_perf_callbacks(NULL); out: + if (err) + hyp_cpu_pm_exit(); + if (err || !is_protected_kvm_enabled()) - on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); + on_each_cpu(cpu_hyp_uninit, NULL, 1); return err; } -static void teardown_hyp_mode(void) +static void __init teardown_subsystems(void) +{ + kvm_unregister_perf_callbacks(); + hyp_cpu_pm_exit(); +} + +static void __init teardown_hyp_mode(void) { + bool free_sve = system_supports_sve() && is_protected_kvm_enabled(); int cpu; free_hyp_pgds(); for_each_possible_cpu(cpu) { - free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); + if (per_cpu(kvm_hyp_initialized, cpu)) + continue; + + free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT); + + if (!kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]) + continue; + + if (free_sve) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + free_pages((unsigned long) sve_state, pkvm_host_sve_state_order()); + } + free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); + } } -static int do_pkvm_init(u32 hyp_va_bits) +static int __init do_pkvm_init(u32 hyp_va_bits) { void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; @@ -1869,17 +2416,41 @@ static int do_pkvm_init(u32 hyp_va_bits) /* * The stub hypercalls are now disabled, so set our local flag to - * prevent a later re-init attempt in kvm_arch_hardware_enable(). + * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu(). */ - __this_cpu_write(kvm_arm_hardware_enabled, 1); + __this_cpu_write(kvm_hyp_initialized, 1); preempt_enable(); return ret; } +static u64 get_hyp_id_aa64pfr0_el1(void) +{ + /* + * Track whether the system isn't affected by spectre/meltdown in the + * hypervisor's view of id_aa64pfr0_el1, used for protected VMs. + * Although this is per-CPU, we make it global for simplicity, e.g., not + * to have to worry about vcpu migration. + * + * Unlike for non-protected VMs, userspace cannot override this for + * protected VMs. + */ + u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + val &= ~(ID_AA64PFR0_EL1_CSV2 | + ID_AA64PFR0_EL1_CSV3); + + val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV2, + arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED); + val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV3, + arm64_get_meltdown_state() == SPECTRE_UNAFFECTED); + + return val; +} + static void kvm_hyp_init_symbols(void) { - kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1(); kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); @@ -1887,11 +2458,32 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + + /* Propagate the FGT state to the nVHE side */ + kvm_nvhe_sym(hfgrtr_masks) = hfgrtr_masks; + kvm_nvhe_sym(hfgwtr_masks) = hfgwtr_masks; + kvm_nvhe_sym(hfgitr_masks) = hfgitr_masks; + kvm_nvhe_sym(hdfgrtr_masks) = hdfgrtr_masks; + kvm_nvhe_sym(hdfgwtr_masks) = hdfgwtr_masks; + kvm_nvhe_sym(hafgrtr_masks) = hafgrtr_masks; + kvm_nvhe_sym(hfgrtr2_masks) = hfgrtr2_masks; + kvm_nvhe_sym(hfgwtr2_masks) = hfgwtr2_masks; + kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks; + kvm_nvhe_sym(hdfgrtr2_masks)= hdfgrtr2_masks; + kvm_nvhe_sym(hdfgwtr2_masks)= hdfgwtr2_masks; + + /* + * Flush entire BSS since part of its data containing init symbols is read + * while the MMU is off. + */ + kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start), + kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start)); } -static int kvm_hyp_init_protection(u32 hyp_va_bits) +static int __init kvm_hyp_init_protection(u32 hyp_va_bits) { void *addr = phys_to_virt(hyp_mem_base); int ret; @@ -1909,10 +2501,72 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits) return 0; } -/** - * Inits Hyp-mode on all online CPUs +static int init_pkvm_host_sve_state(void) +{ + int cpu; + + if (!system_supports_sve()) + return 0; + + /* Allocate pages for host sve state in protected mode. */ + for_each_possible_cpu(cpu) { + struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order()); + + if (!page) + return -ENOMEM; + + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page); + } + + /* + * Don't map the pages in hyp since these are only used in protected + * mode, which will (re)create its own mapping when initialized. + */ + + return 0; +} + +/* + * Finalizes the initialization of hyp mode, once everything else is initialized + * and the initialziation process cannot fail. */ -static int init_hyp_mode(void) +static void finalize_init_hyp_mode(void) +{ + int cpu; + + if (system_supports_sve() && is_protected_kvm_enabled()) { + for_each_possible_cpu(cpu) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = + kern_hyp_va(sve_state); + } + } +} + +static void pkvm_hyp_init_ptrauth(void) +{ + struct kvm_cpu_context *hyp_ctxt; + int cpu; + + for_each_possible_cpu(cpu) { + hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu); + hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long(); + } +} + +/* Inits Hyp-mode on all online CPUs */ +static int __init init_hyp_mode(void) { u32 hyp_va_bits; int cpu; @@ -1936,15 +2590,15 @@ static int init_hyp_mode(void) * Allocate stack pages for Hypervisor-mode */ for_each_possible_cpu(cpu) { - unsigned long stack_page; + unsigned long stack_base; - stack_page = __get_free_page(GFP_KERNEL); - if (!stack_page) { + stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT); + if (!stack_base) { err = -ENOMEM; goto out_err; } - per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; + per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base; } /* @@ -1975,6 +2629,13 @@ static int init_hyp_mode(void) goto out_err; } + err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start), + kvm_ksym_ref(__hyp_data_end), PAGE_HYP); + if (err) { + kvm_err("Cannot map .hyp.data section\n"); + goto out_err; + } + err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start), kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO); if (err) { @@ -2013,31 +2674,9 @@ static int init_hyp_mode(void) */ for_each_possible_cpu(cpu) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); - char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); - unsigned long hyp_addr; + char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu); - /* - * Allocate a contiguous HYP private VA range for the stack - * and guard page. The allocation is also aligned based on - * the order of its size. - */ - err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr); - if (err) { - kvm_err("Cannot allocate hyp stack guard page\n"); - goto out_err; - } - - /* - * Since the stack grows downwards, map the stack to the page - * at the higher address and leave the lower guard page - * unbacked. - * - * Any valid stack address now has the PAGE_SHIFT bit as 1 - * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. - */ - err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE, - __pa(stack_page), PAGE_HYP); + err = create_hyp_stack(__pa(stack_base), ¶ms->stack_hyp_va); if (err) { kvm_err("Cannot map hyp stack\n"); goto out_err; @@ -2049,9 +2688,7 @@ static int init_hyp_mode(void) * __hyp_pa() won't do the right thing there, since the stack * has been mapped in the flexible private VA space. */ - params->stack_pa = __pa(stack_page); - - params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE); + params->stack_pa = __pa(stack_base); } for_each_possible_cpu(cpu) { @@ -2072,6 +2709,10 @@ static int init_hyp_mode(void) kvm_hyp_init_symbols(); if (is_protected_kvm_enabled()) { + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && + cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH)) + pkvm_hyp_init_ptrauth(); + init_cpu_logical_map(); if (!init_psci_relay()) { @@ -2079,6 +2720,10 @@ static int init_hyp_mode(void) goto out_err; } + err = init_pkvm_host_sve_state(); + if (err) + goto out_err; + err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); @@ -2094,47 +2739,30 @@ out_err: return err; } -static void _kvm_host_prot_finalize(void *arg) +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { - int *err = arg; + struct kvm_vcpu *vcpu = NULL; + struct kvm_mpidr_data *data; + unsigned long i; - if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize))) - WRITE_ONCE(*err, -EINVAL); -} + mpidr &= MPIDR_HWID_BITMASK; -static int pkvm_drop_host_privileges(void) -{ - int ret = 0; + rcu_read_lock(); + data = rcu_dereference(kvm->arch.mpidr_data); - /* - * Flip the static key upfront as that may no longer be possible - * once the host stage 2 is installed. - */ - static_branch_enable(&kvm_protected_mode_initialized); - on_each_cpu(_kvm_host_prot_finalize, &ret, 1); - return ret; -} + if (data) { + u16 idx = kvm_mpidr_index(data, mpidr); -static int finalize_hyp_mode(void) -{ - if (!is_protected_kvm_enabled()) - return 0; + vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]); + if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu)) + vcpu = NULL; + } - /* - * Exclude HYP sections from kmemleak so that they don't get peeked - * at, which would end badly once inaccessible. - */ - kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); - kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); - return pkvm_drop_host_privileges(); -} + rcu_read_unlock(); -struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) -{ - struct kvm_vcpu *vcpu; - unsigned long i; + if (vcpu) + return vcpu; - mpidr &= MPIDR_HWID_BITMASK; kvm_for_each_vcpu(i, vcpu, kvm) { if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) return vcpu; @@ -2147,28 +2775,54 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } -bool kvm_arch_has_irq_bypass(void) -{ - return true; -} - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; + + /* + * The only thing we have a chance of directly-injecting is LPIs. Maybe + * one day... + */ + if (irq_entry->type != KVM_IRQ_ROUTING_MSI) + return 0; return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, &irqfd->irq_entry); } + void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; + + if (irq_entry->type != KVM_IRQ_ROUTING_MSI) + return; + + kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq); +} + +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + if (old->type == KVM_IRQ_ROUTING_MSI && + new->type == KVM_IRQ_ROUTING_MSI && + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) + return; - kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, - &irqfd->irq_entry); + /* + * Remapping the vLPI requires taking the its_lock mutex to resolve + * the new translation. We're in spinlock land at this point, so no + * chance of resolving the translation. + * + * Unmap the vLPI and fall back to software LPI injection. + */ + return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq); } void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) @@ -2187,10 +2841,8 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) kvm_arm_resume_guest(irqfd->kvm); } -/** - * Initialize Hyp-mode and memory mappings on all CPUs. - */ -int kvm_arch_init(void *opaque) +/* Initialize Hyp-mode and memory mappings on all CPUs */ +static __init int kvm_arm_init(void) { int err; bool in_hyp_mode; @@ -2241,33 +2893,42 @@ int kvm_arch_init(void *opaque) err = kvm_init_vector_slots(); if (err) { kvm_err("Cannot initialise vector slots\n"); - goto out_err; + goto out_hyp; } err = init_subsystems(); if (err) goto out_hyp; - if (!in_hyp_mode) { - err = finalize_hyp_mode(); - if (err) { - kvm_err("Failed to finalize Hyp protection\n"); - goto out_hyp; - } - } + kvm_info("%s%sVHE%s mode initialized successfully\n", + in_hyp_mode ? "" : (is_protected_kvm_enabled() ? + "Protected " : "Hyp "), + in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ? + "h" : "n"), + cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": ""); - if (is_protected_kvm_enabled()) { - kvm_info("Protected nVHE mode initialized successfully\n"); - } else if (in_hyp_mode) { - kvm_info("VHE mode initialized successfully\n"); - } else { - kvm_info("Hyp mode initialized successfully\n"); - } + /* + * FIXME: Do something reasonable if kvm_init() fails after pKVM + * hypervisor protection is finalized. + */ + err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (err) + goto out_subs; + + /* + * This should be called after initialization is done and failure isn't + * possible anymore. + */ + if (!in_hyp_mode) + finalize_init_hyp_mode(); + + kvm_arm_initialised = true; return 0; +out_subs: + teardown_subsystems(); out_hyp: - hyp_cpu_pm_exit(); if (!in_hyp_mode) teardown_hyp_mode(); out_err: @@ -2275,12 +2936,6 @@ out_err: return err; } -/* NOP: Compiling as a module not supported */ -void kvm_arch_exit(void) -{ - kvm_unregister_perf_callbacks(); -} - static int __init early_kvm_mode_cfg(char *arg) { if (!arg) @@ -2310,19 +2965,48 @@ static int __init early_kvm_mode_cfg(char *arg) return 0; } + if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) { + kvm_mode = KVM_MODE_NV; + return 0; + } + return -EINVAL; } early_param("kvm-arm.mode", early_kvm_mode_cfg); -enum kvm_mode kvm_get_mode(void) +static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p) { - return kvm_mode; + if (!arg) + return -EINVAL; + + if (strcmp(arg, "trap") == 0) { + *p = KVM_WFX_TRAP; + return 0; + } + + if (strcmp(arg, "notrap") == 0) { + *p = KVM_WFX_NOTRAP; + return 0; + } + + return -EINVAL; } -static int arm_init(void) +static int __init early_kvm_wfi_trap_policy_cfg(char *arg) { - int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); - return rc; + return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy); +} +early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg); + +static int __init early_kvm_wfe_trap_policy_cfg(char *arg) +{ + return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy); +} +early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg); + +enum kvm_mode kvm_get_mode(void) +{ + return kvm_mode; } -module_init(arm_init); +module_init(kvm_arm_init); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c new file mode 100644 index 000000000000..53bf70126f81 --- /dev/null +++ b/arch/arm64/kvm/at.c @@ -0,0 +1,1795 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Linaro Ltd + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm_host.h> + +#include <asm/esr.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw) +{ + wr->fst = fst; + wr->ptw = s1ptw; + wr->s2 = s1ptw; + wr->failed = true; +} + +#define S1_MMU_DISABLED (-127) + +static int get_ia_size(struct s1_walk_info *wi) +{ + return 64 - wi->txsz; +} + +/* Return true if the IPA is out of the OA range */ +static bool check_output_size(u64 ipa, struct s1_walk_info *wi) +{ + if (wi->pa52bit) + return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits)); + return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); +} + +static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr) +{ + switch (BIT(wi->pgshift)) { + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52)) + return false; + return ((wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : + FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110); + case SZ_16K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)) + return false; + break; + case SZ_4K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)) + return false; + break; + } + + return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS)); +} + +static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc) +{ + u64 addr; + + if (!wi->pa52bit) + return desc & GENMASK_ULL(47, wi->pgshift); + + switch (BIT(wi->pgshift)) { + case SZ_4K: + case SZ_16K: + addr = desc & GENMASK_ULL(49, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50; + break; + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + addr = desc & GENMASK_ULL(47, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48; + break; + } + + return addr; +} + +/* Return the translation regime that applies to an AT instruction */ +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) +{ + /* + * We only get here from guest EL2, so the translation + * regime AT applies to is solely defined by {E2H,TGE}. + */ + switch (op) { + case OP_AT_S1E2R: + case OP_AT_S1E2W: + case OP_AT_S1E2A: + return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + default: + return (vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; + } +} + +static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (regime == TR_EL10) { + if (vcpu_has_nv(vcpu) && + !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) + return 0; + + return vcpu_read_sys_reg(vcpu, TCR2_EL1); + } + + return vcpu_read_sys_reg(vcpu, TCR2_EL2); +} + +static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (!kvm_has_s1pie(vcpu->kvm)) + return false; + + /* Abuse TCR2_EL1_PIE and use it for EL2 as well */ + return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE; +} + +static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) +{ + u64 val; + + if (!kvm_has_s1poe(vcpu->kvm)) { + wi->poe = wi->e0poe = false; + return; + } + + val = effective_tcr2(vcpu, wi->regime); + + /* Abuse TCR2_EL1_* for EL2 */ + wi->poe = val & TCR2_EL1_POE; + wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE); +} + +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; + unsigned int stride, x; + bool va55, tbi, lva; + + va55 = va & BIT(55); + + if (vcpu_has_nv(vcpu)) { + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + } else { + WARN_ON_ONCE(wi->regime != TR_EL10); + wi->s2 = false; + hcr = 0; + } + + switch (wi->regime) { + case TR_EL10: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL2: + case TR_EL20: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55 && wi->regime != TR_EL2) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + wi->pa52bit = has_52bit_pa(vcpu, wi, tcr); + + ia_bits = get_ia_size(wi); + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + + tbi = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_TBI, tcr) : + (va55 ? + FIELD_GET(TCR_TBI1, tcr) : + FIELD_GET(TCR_TBI0, tcr))); + + if (!tbi && (u64)sign_extend64(va, 55) != va) + goto addrsz; + + wi->sh = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_SH0_MASK, tcr) : + (va55 ? + FIELD_GET(TCR_SH1_MASK, tcr) : + FIELD_GET(TCR_SH0_MASK, tcr))); + + va = (u64)sign_extend64(va, 55); + + /* Let's put the MMU disabled case aside immediately */ + switch (wi->regime) { + case TR_EL10: + /* + * If dealing with the EL1&0 translation regime, 3 things + * can disable the S1 translation: + * + * - HCR_EL2.DC = 1 + * - HCR_EL2.{E2H,TGE} = {0,1} + * - SCTLR_EL1.M = 0 + * + * The TGE part is interesting. If we have decided that this + * is EL1&0, then it means that either {E2H,TGE} == {1,0} or + * {0,x}, and we only need to test for TGE == 1. + */ + if (hcr & (HCR_DC | HCR_TGE)) { + wr->level = S1_MMU_DISABLED; + break; + } + fallthrough; + case TR_EL2: + case TR_EL20: + if (!(sctlr & SCTLR_ELx_M)) + wr->level = S1_MMU_DISABLED; + break; + } + + if (wr->level == S1_MMU_DISABLED) { + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) + goto addrsz; + + wr->pa = va; + return 0; + } + + wi->be = sctlr & SCTLR_ELx_EE; + + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); + wi->hpd &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HPD, tcr) : + (va55 ? + FIELD_GET(TCR_HPD1, tcr) : + FIELD_GET(TCR_HPD0, tcr))); + /* R_JHSVW */ + wi->hpd |= s1pie_enabled(vcpu, wi->regime); + + /* Do we have POE? */ + compute_s1poe(vcpu, wi); + + /* R_BVXDG */ + wi->hpd |= (wi->poe || wi->e0poe); + + /* R_PLCGL, R_YXNYW */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { + if (wi->txsz > 39) + goto transfault; + } else { + if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) + goto transfault; + } + + /* R_GTJBY, R_SXWGM */ + switch (BIT(wi->pgshift)) { + case SZ_4K: + case SZ_16K: + lva = wi->pa52bit; + break; + case SZ_64K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); + break; + } + + if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) + goto transfault; + + /* R_YYVYV, I_THCZK */ + if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || + (va55 && va < GENMASK(63, ia_bits))) + goto transfault; + + /* I_ZFSYQ */ + if (wi->regime != TR_EL2 && + (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) + goto transfault; + + /* R_BNDVG and following statements */ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && + wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + goto transfault; + + ps = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); + + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit)); + + /* Compute minimal alignment */ + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); + + wi->baddr = ttbr & TTBRx_EL1_BADDR; + if (wi->pa52bit) { + /* + * Force the alignment on 64 bytes for top-level tables + * smaller than 8 entries, since TTBR.BADDR[5:2] are used to + * store bits [51:48] of the first level of lookup. + */ + x = max(x, 6); + + wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48; + } + + /* R_VPBBF */ + if (check_output_size(wi->baddr, wi)) + goto addrsz; + + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + + wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF); + wi->ha &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HA, tcr) : + FIELD_GET(TCR_HA, tcr)); + + return 0; + +addrsz: + /* + * Address Size Fault level 0 to indicate it comes from TTBR. + * yes, this is an oddity. + */ + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false); + return -EFAULT; + +transfault: + /* Translation Fault on start level */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false); + return -EFAULT; +} + +static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc, + struct s1_walk_info *wi) +{ + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; +} + +static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new, + struct s1_walk_info *wi) +{ + if (wi->be) { + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); + } else { + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 va_top, va_bottom, baddr, desc, new_desc, ipa; + int level, stride, ret; + + level = wi->sl; + stride = wi->pgshift - 3; + baddr = wi->baddr; + + va_top = get_ia_size(wi) - 1; + + while (1) { + u64 index; + + va_bottom = (3 - level) * stride + wi->pgshift; + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); + + ipa = baddr | index; + + if (wi->s2) { + struct kvm_s2_trans s2_trans = {}; + + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); + if (ret) { + fail_s1_walk(wr, + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, + true); + return ret; + } + + if (!kvm_s2_trans_readable(&s2_trans)) { + fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), + true); + + return -EPERM; + } + + ipa = kvm_s2_trans_output(&s2_trans); + } + + if (wi->filter) { + ret = wi->filter->fn(&(struct s1_walk_context) + { + .wi = wi, + .table_ipa = baddr, + .level = level, + }, wi->filter->priv); + if (ret) + return ret; + } + + ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi); + if (ret) { + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); + return ret; + } + + new_desc = desc; + + /* Invalid descriptor */ + if (!(desc & BIT(0))) + goto transfault; + + /* Block mapping, check validity down the line */ + if (!(desc & BIT(1))) + break; + + /* Page mapping */ + if (level == 3) + break; + + /* Table handling */ + if (!wi->hpd) { + wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); + } + + baddr = desc_to_oa(wi, desc); + + /* Check for out-of-range OA */ + if (check_output_size(baddr, wi)) + goto addrsz; + + /* Prepare for next round */ + va_top = va_bottom - 1; + level++; + } + + /* Block mapping, check the validity of the level */ + if (!(desc & BIT(1))) { + bool valid_block = false; + + switch (BIT(wi->pgshift)) { + case SZ_4K: + valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0); + break; + case SZ_16K: + case SZ_64K: + valid_block = level == 2 || (wi->pa52bit && level == 1); + break; + } + + if (!valid_block) + goto transfault; + } + + baddr = desc_to_oa(wi, desc); + if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) + goto addrsz; + + if (wi->ha) + new_desc |= PTE_AF; + + if (new_desc != desc) { + ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + + if (!(desc & PTE_AF)) { + fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false); + return -EACCES; + } + + va_bottom += contiguous_bit_shift(desc, wi, level); + + wr->failed = false; + wr->level = level; + wr->desc = desc; + wr->pa = baddr & GENMASK(52, va_bottom); + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + + wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); + if (wr->nG) { + u64 asid_ttbr, tcr; + + switch (wi->regime) { + case TR_EL10: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL20: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + wr->asid &= GENMASK(7, 0); + } + + return 0; + +addrsz: + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false); + return -EINVAL; +transfault: + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false); + return -ENOENT; +} + +struct mmu_config { + u64 ttbr0; + u64 ttbr1; + u64 tcr; + u64 mair; + u64 tcr2; + u64 pir; + u64 pire0; + u64 por_el0; + u64 por_el1; + u64 sctlr; + u64 vttbr; + u64 vtcr; +}; + +static void __mmu_config_save(struct mmu_config *config) +{ + config->ttbr0 = read_sysreg_el1(SYS_TTBR0); + config->ttbr1 = read_sysreg_el1(SYS_TTBR1); + config->tcr = read_sysreg_el1(SYS_TCR); + config->mair = read_sysreg_el1(SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + config->tcr2 = read_sysreg_el1(SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + config->pir = read_sysreg_el1(SYS_PIR); + config->pire0 = read_sysreg_el1(SYS_PIRE0); + } + if (system_supports_poe()) { + config->por_el1 = read_sysreg_el1(SYS_POR); + config->por_el0 = read_sysreg_s(SYS_POR_EL0); + } + } + config->sctlr = read_sysreg_el1(SYS_SCTLR); + config->vttbr = read_sysreg(vttbr_el2); + config->vtcr = read_sysreg(vtcr_el2); +} + +static void __mmu_config_restore(struct mmu_config *config) +{ + /* + * ARM errata 1165522 and 1530923 require TGE to be 1 before + * we update the guest state. + */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + + write_sysreg_el1(config->ttbr0, SYS_TTBR0); + write_sysreg_el1(config->ttbr1, SYS_TTBR1); + write_sysreg_el1(config->tcr, SYS_TCR); + write_sysreg_el1(config->mair, SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + write_sysreg_el1(config->tcr2, SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + write_sysreg_el1(config->pir, SYS_PIR); + write_sysreg_el1(config->pire0, SYS_PIRE0); + } + if (system_supports_poe()) { + write_sysreg_el1(config->por_el1, SYS_POR); + write_sysreg_s(config->por_el0, SYS_POR_EL0); + } + } + write_sysreg_el1(config->sctlr, SYS_SCTLR); + write_sysreg(config->vttbr, vttbr_el2); + write_sysreg(config->vtcr, vtcr_el2); +} + +static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 host_pan; + bool fail; + + host_pan = read_sysreg_s(SYS_PSTATE_PAN); + write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); + + switch (op) { + case OP_AT_S1E1RP: + fail = __kvm_at(OP_AT_S1E1RP, vaddr); + break; + case OP_AT_S1E1WP: + fail = __kvm_at(OP_AT_S1E1WP, vaddr); + break; + } + + write_sysreg_s(host_pan, SYS_PSTATE_PAN); + + return fail; +} + +#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) +#define MEMATTR_NC 0b0100 +#define MEMATTR_Wt 0b1000 +#define MEMATTR_Wb 0b1100 +#define MEMATTR_WbRaWa 0b1111 + +#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) + +static u8 s2_memattr_to_attr(u8 memattr) +{ + memattr &= 0b1111; + + switch (memattr) { + case 0b0000: + case 0b0001: + case 0b0010: + case 0b0011: + return memattr << 2; + case 0b0100: + return MEMATTR(Wb, Wb); + case 0b0101: + return MEMATTR(NC, NC); + case 0b0110: + return MEMATTR(Wt, NC); + case 0b0111: + return MEMATTR(Wb, NC); + case 0b1000: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1001: + return MEMATTR(NC, Wt); + case 0b1010: + return MEMATTR(Wt, Wt); + case 0b1011: + return MEMATTR(Wb, Wt); + case 0b1100: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1101: + return MEMATTR(NC, Wb); + case 0b1110: + return MEMATTR(Wt, Wb); + case 0b1111: + return MEMATTR(Wb, Wb); + default: + unreachable(); + } +} + +static u8 combine_s1_s2_attr(u8 s1, u8 s2) +{ + bool transient; + u8 final = 0; + + /* Upgrade transient s1 to non-transient to simplify things */ + switch (s1) { + case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ + transient = true; + s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); + break; + case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */ + transient = true; + s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); + break; + default: + transient = false; + } + + /* S2CombineS1AttrHints() */ + if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || + (s2 & GENMASK(3, 2)) == MEMATTR_NC) + final = MEMATTR_NC; + else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || + (s2 & GENMASK(3, 2)) == MEMATTR_Wt) + final = MEMATTR_Wt; + else + final = MEMATTR_Wb; + + if (final != MEMATTR_NC) { + /* Inherit RaWa hints form S1 */ + if (transient) { + switch (s1 & GENMASK(3, 2)) { + case MEMATTR_Wt: + final = 0; + break; + case MEMATTR_Wb: + final = MEMATTR_NC; + break; + } + } + + final |= s1 & GENMASK(1, 0); + } + + return final; +} + +#define ATTR_NSH 0b00 +#define ATTR_RSV 0b01 +#define ATTR_OSH 0b10 +#define ATTR_ISH 0b11 + +static u8 compute_final_sh(u8 attr, u8 sh) +{ + /* Any form of device, as well as NC has SH[1:0]=0b10 */ + if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) + return ATTR_OSH; + + if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ + sh = ATTR_NSH; + + return sh; +} + +static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr, + u8 attr) +{ + u8 sh; + + /* + * non-52bit and LPA have their basic shareability described in the + * descriptor. LPA2 gets it from the corresponding field in TCR, + * conveniently recorded in the walk info. + */ + if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K) + sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc); + else + sh = wi->sh; + + return compute_final_sh(attr, sh); +} + +static u8 combine_sh(u8 s1_sh, u8 s2_sh) +{ + if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) + return ATTR_OSH; + if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) + return ATTR_ISH; + + return ATTR_NSH; +} + +static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, + struct kvm_s2_trans *tr) +{ + u8 s1_parattr, s2_memattr, final_attr, s2_sh; + u64 par; + + /* If S2 has failed to translate, report the damage */ + if (tr->esr) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= SYS_PAR_EL1_S; + par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); + return par; + } + + s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); + s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { + if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) + s2_memattr &= ~BIT(3); + + /* Combination of R_VRJSW and R_RHWZM */ + switch (s2_memattr) { + case 0b0101: + if (MEMATTR_IS_DEVICE(s1_parattr)) + final_attr = s1_parattr; + else + final_attr = MEMATTR(NC, NC); + break; + case 0b0110: + case 0b1110: + final_attr = MEMATTR(WbRaWa, WbRaWa); + break; + case 0b0111: + case 0b1111: + /* Preserve S1 attribute */ + final_attr = s1_parattr; + break; + case 0b0100: + case 0b1100: + case 0b1101: + /* Reserved, do something non-silly */ + final_attr = s1_parattr; + break; + default: + /* + * MemAttr[2]=0, Device from S2. + * + * FWB does not influence the way that stage 1 + * memory types and attributes are combined + * with stage 2 Device type and attributes. + */ + final_attr = min(s2_memattr_to_attr(s2_memattr), + s1_parattr); + } + } else { + /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ + u8 s2_parattr = s2_memattr_to_attr(s2_memattr); + + if (MEMATTR_IS_DEVICE(s1_parattr) || + MEMATTR_IS_DEVICE(s2_parattr)) { + final_attr = min(s1_parattr, s2_parattr); + } else { + /* At this stage, this is memory vs memory */ + final_attr = combine_s1_s2_attr(s1_parattr & 0xf, + s2_parattr & 0xf); + final_attr |= combine_s1_s2_attr(s1_parattr >> 4, + s2_parattr >> 4) << 4; + } + } + + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && + !MEMATTR_IS_DEVICE(final_attr)) + final_attr = MEMATTR(NC, NC); + + s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc); + + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); + par |= tr->output & GENMASK(47, 12); + par |= FIELD_PREP(SYS_PAR_EL1_SH, + combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), + compute_final_sh(final_attr, s2_sh))); + + return par; +} + +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u64 par; + + if (wr->failed) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); + par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; + par |= wr->s2 ? SYS_PAR_EL1_S : 0; + } else if (wr->level == S1_MMU_DISABLED) { + /* MMU off or HCR_EL2.DC == 1 */ + par = SYS_PAR_EL1_NSE; + par |= wr->pa & SYS_PAR_EL1_PA; + + if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) && + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, + MEMATTR(WbRaWa, WbRaWa)); + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); + } else { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); + } + } else { + u64 mair, sctlr; + u8 sh; + + par = SYS_PAR_EL1_NSE; + + mair = (wi->regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, MAIR_EL1) : + vcpu_read_sys_reg(vcpu, MAIR_EL2)); + + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; + mair &= 0xff; + + sctlr = (wi->regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, SCTLR_EL1) : + vcpu_read_sys_reg(vcpu, SCTLR_EL2)); + + /* Force NC for memory if SCTLR_ELx.C is clear */ + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) + mair = MEMATTR(NC, NC); + + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); + par |= wr->pa & SYS_PAR_EL1_PA; + + sh = compute_s1_sh(wi, wr, mair); + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); + } + + return par; +} + +static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + u64 sctlr; + + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + return false; + + if (s1pie_enabled(vcpu, regime)) + return true; + + if (regime == TR_EL10) + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + else + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + + return sctlr & SCTLR_EL1_EPAN; +} + +static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + bool wxn; + + /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) { + case 0b00: + wr->pr = wr->pw = true; + wr->ur = wr->uw = false; + break; + case 0b01: + wr->pr = wr->pw = wr->ur = wr->uw = true; + break; + case 0b10: + wr->pr = true; + wr->pw = wr->ur = wr->uw = false; + break; + case 0b11: + wr->pr = wr->ur = true; + wr->pw = wr->uw = false; + break; + } + + /* We don't use px for anything yet, but hey... */ + wr->px = !((wr->desc & PTE_PXN) || wr->uw); + wr->ux = !(wr->desc & PTE_UXN); + } else { + wr->ur = wr->uw = wr->ux = false; + + if (!(wr->desc & PTE_RDONLY)) { + wr->pr = wr->pw = true; + } else { + wr->pr = true; + wr->pw = false; + } + + /* XN maps to UXN */ + wr->px = !(wr->desc & PTE_UXN); + } + + switch (wi->regime) { + case TR_EL2: + case TR_EL20: + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); + break; + case TR_EL10: + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); + break; + } + + wr->pwxn = wr->uwxn = wxn; + wr->pov = wi->poe; + wr->uov = wi->e0poe; +} + +static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + /* Hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (wr->APTable) { + case 0b00: + break; + case 0b01: + wr->ur = wr->uw = false; + break; + case 0b10: + wr->pw = wr->uw = false; + break; + case 0b11: + wr->pw = wr->ur = wr->uw = false; + break; + } + + wr->px &= !wr->PXNTable; + wr->ux &= !wr->UXNTable; + } else { + if (wr->APTable & BIT(1)) + wr->pw = false; + + /* XN maps to UXN */ + wr->px &= !wr->UXNTable; + } +} + +#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf) + +#define set_priv_perms(wr, r, w, x) \ + do { \ + (wr)->pr = (r); \ + (wr)->pw = (w); \ + (wr)->px = (x); \ + } while (0) + +#define set_unpriv_perms(wr, r, w, x) \ + do { \ + (wr)->ur = (r); \ + (wr)->uw = (w); \ + (wr)->ux = (x); \ + } while (0) + +#define set_priv_wxn(wr, v) \ + do { \ + (wr)->pwxn = (v); \ + } while (0) + +#define set_unpriv_wxn(wr, v) \ + do { \ + (wr)->uwxn = (v); \ + } while (0) + +/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */ +#define set_perms(w, wr, ip) \ + do { \ + /* R_LLZDZ */ \ + switch ((ip)) { \ + case 0b0000: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b0010: \ + set_ ## w ## _perms((wr), false, false, true ); \ + break; \ + case 0b0011: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b0100: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0101: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b0110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b0111: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1000: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1010: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b1011: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1100: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b1101: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1111: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + } \ + \ + /* R_HJYGR */ \ + set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \ + \ + } while (0) + +static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 up, pp, idx; + + idx = pte_pi_index(wr->desc); + + switch (wi->regime) { + case TR_EL10: + pp = perm_idx(vcpu, PIR_EL1, idx); + up = perm_idx(vcpu, PIRE0_EL1, idx); + break; + case TR_EL20: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = perm_idx(vcpu, PIRE0_EL2, idx); + break; + case TR_EL2: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = 0; + break; + } + + set_perms(priv, wr, pp); + + if (wi->regime != TR_EL2) + set_perms(unpriv, wr, up); + else + set_unpriv_perms(wr, false, false, false); + + wr->pov = wi->poe && !(pp & BIT(3)); + wr->uov = wi->e0poe && !(up & BIT(3)); + + /* R_VFPJF */ + if (wr->px && wr->uw) { + set_priv_perms(wr, false, false, false); + set_unpriv_perms(wr, false, false, false); + } +} + +static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 idx, pov_perms, uov_perms; + + idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc); + + if (wr->pov) { + switch (wi->regime) { + case TR_EL10: + pov_perms = perm_idx(vcpu, POR_EL1, idx); + break; + case TR_EL20: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + break; + case TR_EL2: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + break; + } + + if (pov_perms & ~POE_RWX) + pov_perms = POE_NONE; + + /* R_QXXPC, S1PrivOverflow enabled */ + if (wr->pwxn && (pov_perms & POE_X)) + pov_perms &= ~POE_W; + + wr->pr &= pov_perms & POE_R; + wr->pw &= pov_perms & POE_W; + wr->px &= pov_perms & POE_X; + } + + if (wr->uov) { + switch (wi->regime) { + case TR_EL10: + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL20: + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL2: + uov_perms = 0; + break; + } + + if (uov_perms & ~POE_RWX) + uov_perms = POE_NONE; + + /* R_NPBXC, S1UnprivOverlay enabled */ + if (wr->uwxn && (uov_perms & POE_X)) + uov_perms &= ~POE_W; + + wr->ur &= uov_perms & POE_R; + wr->uw &= uov_perms & POE_W; + wr->ux &= uov_perms & POE_X; + } +} + +static void compute_s1_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + bool pan; + + if (!s1pie_enabled(vcpu, wi->regime)) + compute_s1_direct_permissions(vcpu, wi, wr); + else + compute_s1_indirect_permissions(vcpu, wi, wr); + + if (!wi->hpd) + compute_s1_hierarchical_permissions(vcpu, wi, wr); + + compute_s1_overlay_permissions(vcpu, wi, wr); + + /* R_QXXPC, S1PrivOverlay disabled */ + if (!wr->pov) + wr->px &= !(wr->pwxn && wr->pw); + + /* R_NPBXC, S1UnprivOverlay disabled */ + if (!wr->uov) + wr->ux &= !(wr->uwxn && wr->uw); + + pan = wi->pan && (wr->ur || wr->uw || + (pan3_enabled(vcpu, wi->regime) && wr->ux)); + wr->pw &= !pan; + wr->pr &= !pan; +} + +static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par) +{ + struct s1_walk_result wr = {}; + struct s1_walk_info wi = {}; + bool perm_fail = false; + int ret, idx; + + wi.regime = compute_translation_regime(vcpu, op); + wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); + + ret = setup_s1_walk(vcpu, &wi, &wr, vaddr); + if (ret) + goto compute_par; + + if (wr.level == S1_MMU_DISABLED) + goto compute_par; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = walk_s1(vcpu, &wi, &wr, vaddr); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + /* + * Race to update a descriptor -- restart the walk. + */ + if (ret == -EAGAIN) + return ret; + if (ret) + goto compute_par; + + compute_s1_permissions(vcpu, &wi, &wr); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1R: + case OP_AT_S1E2R: + perm_fail = !wr.pr; + break; + case OP_AT_S1E1WP: + case OP_AT_S1E1W: + case OP_AT_S1E2W: + perm_fail = !wr.pw; + break; + case OP_AT_S1E0R: + perm_fail = !wr.ur; + break; + case OP_AT_S1E0W: + perm_fail = !wr.uw; + break; + case OP_AT_S1E1A: + case OP_AT_S1E2A: + break; + default: + BUG(); + } + + if (perm_fail) + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); + +compute_par: + *par = compute_par_s1(vcpu, &wi, &wr); + return 0; +} + +/* + * Return the PAR_EL1 value as the result of a valid translation. + * + * If the translation is unsuccessful, the value may only contain + * PAR_EL1.F, and cannot be taken at face value. It isn't an + * indication of the translation having failed, only that the fast + * path did not succeed, *unless* it indicates a S1 permission or + * access fault. + */ +static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct mmu_config config; + struct kvm_s2_mmu *mmu; + bool fail, mmu_cs; + u64 par; + + par = SYS_PAR_EL1_F; + + /* + * We've trapped, so everything is live on the CPU. As we will + * be switching contexts behind everybody's back, disable + * interrupts while holding the mmu lock. + */ + guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); + + /* + * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already + * the right one (as we trapped from vEL2). If not, save the + * full MMU context. + * + * We are also guaranteed to be in the correct context if + * we're not in a nested VM. + */ + mmu_cs = (vcpu_has_nv(vcpu) && + !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))); + if (!mmu_cs) + goto skip_mmu_switch; + + /* + * Obtaining the S2 MMU for a L2 is horribly racy, and we may not + * find it (recycled by another vcpu, for example). When this + * happens, admit defeat immediately and use the SW (slow) path. + */ + mmu = lookup_s2_mmu(vcpu); + if (!mmu) + return par; + + __mmu_config_save(&config); + + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + if (kvm_has_tcr2(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); + if (kvm_has_s1pie(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); + } + if (kvm_has_s1poe(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR); + write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0); + } + } + write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); + __load_stage2(mmu, mmu->arch); + +skip_mmu_switch: + /* Temporarily switch back to guest context */ + write_sysreg_hcr(vcpu->arch.hcr_el2); + isb(); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1WP: + fail = at_s1e1p_fast(vcpu, op, vaddr); + break; + case OP_AT_S1E1R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E1W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E0R: + fail = __kvm_at(OP_AT_S1E0R, vaddr); + break; + case OP_AT_S1E0W: + fail = __kvm_at(OP_AT_S1E0W, vaddr); + break; + case OP_AT_S1E1A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + break; + } + + if (!fail) + par = read_sysreg_par(); + + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); + + if (mmu_cs) + __mmu_config_restore(&config); + + return par; +} + +static bool par_check_s1_perm_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && + !(par & SYS_PAR_EL1_S)); +} + +static bool par_check_s1_access_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS && + !(par & SYS_PAR_EL1_S)); +} + +int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + int ret; + + /* + * If PAR_EL1 reports that AT failed on a S1 permission or access + * fault, we know for sure that the PTW was able to walk the S1 + * tables and there's nothing else to do. + * + * If AT failed for any other reason, then we must walk the guest S1 + * to emulate the instruction. + */ + if ((par & SYS_PAR_EL1_F) && + !par_check_s1_perm_fault(par) && + !par_check_s1_access_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; +} + +int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par; + int ret; + + /* + * We've trapped, so everything is live on the CPU. As we will be + * switching context behind everybody's back, disable interrupts... + */ + scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { + u64 val, hcr; + bool fail; + + val = hcr = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + val |= HCR_VM; + + if (!vcpu_el2_e2h_is_set(vcpu)) + val |= HCR_NV | HCR_NV1; + + write_sysreg_hcr(val); + isb(); + + par = SYS_PAR_EL1_F; + + switch (op) { + case OP_AT_S1E2R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E2W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E2A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + } + + isb(); + + if (!fail) + par = read_sysreg_par(); + + write_sysreg_hcr(hcr); + isb(); + } + + /* We failed the translation, let's replay it in slow motion */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; +} + +int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct kvm_s2_trans out = {}; + u64 ipa, par; + bool write; + int ret; + + /* Do the stage-1 translation */ + switch (op) { + case OP_AT_S12E1R: + op = OP_AT_S1E1R; + write = false; + break; + case OP_AT_S12E1W: + op = OP_AT_S1E1W; + write = true; + break; + case OP_AT_S12E0R: + op = OP_AT_S1E0R; + write = false; + break; + case OP_AT_S12E0W: + op = OP_AT_S1E0W; + write = true; + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + __kvm_at_s1e01(vcpu, op, vaddr); + par = vcpu_read_sys_reg(vcpu, PAR_EL1); + if (par & SYS_PAR_EL1_F) + return 0; + + /* + * If we only have a single stage of translation (EL2&0), exit + * early. Same thing if {VM,DC}=={0,0}. + */ + if (compute_translation_regime(vcpu, op) == TR_EL20 || + !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) + return 0; + + /* Do the stage-2 translation */ + ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); + out.esr = 0; + ret = kvm_walk_nested_s2(vcpu, ipa, &out); + if (ret < 0) + return ret; + + /* Check the access permission */ + if (!out.esr && + ((!write && !out.readable) || (write && !out.writable))) + out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); + + par = compute_par_s12(vcpu, par, &out); + vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; +} + +/* + * Translate a VA for a given EL in a given translation regime, with + * or without PAN. This requires wi->{regime, as_el0, pan} to be + * set. The rest of the wi and wr should be 0-initialised. + */ +int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + int ret; + + ret = setup_s1_walk(vcpu, wi, wr, va); + if (ret) + return ret; + + if (wr->level == S1_MMU_DISABLED) { + wr->ur = wr->uw = wr->ux = true; + wr->pr = wr->pw = wr->px = true; + } else { + ret = walk_s1(vcpu, wi, wr, va); + if (ret) + return ret; + + compute_s1_permissions(vcpu, wi, wr); + } + + return 0; +} + +struct desc_match { + u64 ipa; + int level; +}; + +static int match_s1_desc(struct s1_walk_context *ctxt, void *priv) +{ + struct desc_match *dm = priv; + u64 ipa = dm->ipa; + + /* Use S1 granule alignment */ + ipa &= GENMASK(51, ctxt->wi->pgshift); + + /* Not the IPA we're looking for? Continue. */ + if (ipa != ctxt->table_ipa) + return 0; + + /* Note the level and interrupt the walk */ + dm->level = ctxt->level; + return -EINTR; +} + +int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) +{ + struct desc_match dm = { + .ipa = ipa, + }; + struct s1_walk_info wi = { + .filter = &(struct s1_walk_filter){ + .fn = match_s1_desc, + .priv = &dm, + }, + .as_el0 = false, + .pan = false, + }; + struct s1_walk_result wr = {}; + int ret; + + if (is_hyp_ctxt(vcpu)) + wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + else + wi.regime = TR_EL10; + + ret = setup_s1_walk(vcpu, &wi, &wr, va); + if (ret) + return ret; + + /* We really expect the S1 MMU to be on here... */ + if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) { + *level = 0; + return 0; + } + + /* Walk the guest's PT, looking for a match along the way */ + ret = walk_s1(vcpu, &wi, &wr, va); + switch (ret) { + case -EINTR: + /* We interrupted the walk on a match, return the level */ + *level = dm.level; + return 0; + case 0: + /* The walk completed, we failed to find the entry */ + return -ENOENT; + default: + /* Any other error... */ + return ret; + } +} + +#ifdef CONFIG_ARM64_LSE_ATOMICS +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + u64 tmp = old; + int ret = 0; + + uaccess_enable_privileged(); + + asm volatile(__LSE_PREAMBLE + "1: cas %[old], %[new], %[addr]\n" + "2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) + : [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + if (ret) + return ret; + if (tmp != old) + return -EAGAIN; + + return ret; +} +#else +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + return -EINVAL; +} +#endif + +static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + int ret = 1; + u64 tmp; + + uaccess_enable_privileged(); + + asm volatile("prfm pstl1strm, %[addr]\n" + "1: ldxr %[tmp], %[addr]\n" + "sub %[tmp], %[tmp], %[old]\n" + "cbnz %[tmp], 3f\n" + "2: stlxr %w[ret], %[new], %[addr]\n" + "3:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) + : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp) + : [old] "r" (old), [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + /* STLXR didn't update the descriptor, or the compare failed */ + if (ret == 1) + return -EAGAIN; + + return ret; +} + +int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + u64 __user *ptep; + bool writable; + int offset; + gfn_t gfn; + int r; + + lockdep_assert(srcu_read_lock_held(&kvm->srcu)); + + gfn = ipa >> PAGE_SHIFT; + offset = offset_in_page(ipa); + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return -EINVAL; + if (!writable) + return -EPERM; + + ptep = (u64 __user *)hva + offset; + if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) + r = __lse_swap_desc(ptep, old, new); + else + r = __llsc_swap_desc(ptep, old, new); + + if (r < 0) + return r; + + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c new file mode 100644 index 000000000000..24bb3f36e9d5 --- /dev/null +++ b/arch/arm64/kvm/config.c @@ -0,0 +1,1520 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> +#include <asm/sysreg.h> + +/* + * Describes the dependencies between a set of bits (or the negation + * of a set of RES0 bits) and a feature. The flags indicate how the + * data is interpreted. + */ +struct reg_bits_to_feat_map { + union { + u64 bits; + u64 *res0p; + }; + +#define NEVER_FGU BIT(0) /* Can trap, but never UNDEF */ +#define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */ +#define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */ +#define RES0_POINTER BIT(3) /* Pointer to RES0 value instead of bits */ + + unsigned long flags; + + union { + struct { + u8 regidx; + u8 shift; + u8 width; + bool sign; + s8 lo_lim; + }; + bool (*match)(struct kvm *); + bool (*fval)(struct kvm *, u64 *); + }; +}; + +/* + * Describes the dependencies for a given register: + * + * @feat_map describes the dependency for the whole register. If the + * features the register depends on are not present, the whole + * register is effectively RES0. + * + * @bit_feat_map describes the dependencies for a set of bits in that + * register. If the features these bits depend on are not present, the + * bits are effectively RES0. + */ +struct reg_feat_map_desc { + const char *name; + const struct reg_bits_to_feat_map feat_map; + const struct reg_bits_to_feat_map *bit_feat_map; + const unsigned int bit_feat_map_sz; +}; + +#define __NEEDS_FEAT_3(m, f, w, id, fld, lim) \ + { \ + .w = (m), \ + .flags = (f), \ + .regidx = IDREG_IDX(SYS_ ## id), \ + .shift = id ##_## fld ## _SHIFT, \ + .width = id ##_## fld ## _WIDTH, \ + .sign = id ##_## fld ## _SIGNED, \ + .lo_lim = id ##_## fld ##_## lim \ + } + +#define __NEEDS_FEAT_2(m, f, w, fun, dummy) \ + { \ + .w = (m), \ + .flags = (f) | CALL_FUNC, \ + .fval = (fun), \ + } + +#define __NEEDS_FEAT_1(m, f, w, fun) \ + { \ + .w = (m), \ + .flags = (f) | CALL_FUNC, \ + .match = (fun), \ + } + +#define __NEEDS_FEAT_FLAG(m, f, w, ...) \ + CONCATENATE(__NEEDS_FEAT_, COUNT_ARGS(__VA_ARGS__))(m, f, w, __VA_ARGS__) + +#define NEEDS_FEAT_FLAG(m, f, ...) \ + __NEEDS_FEAT_FLAG(m, f, bits, __VA_ARGS__) + +#define NEEDS_FEAT_FIXED(m, ...) \ + __NEEDS_FEAT_FLAG(m, FIXED_VALUE, bits, __VA_ARGS__, 0) + +#define NEEDS_FEAT_RES0(p, ...) \ + __NEEDS_FEAT_FLAG(p, RES0_POINTER, res0p, __VA_ARGS__) + +/* + * Declare the dependency between a set of bits and a set of features, + * generating a struct reg_bit_to_feat_map. + */ +#define NEEDS_FEAT(m, ...) NEEDS_FEAT_FLAG(m, 0, __VA_ARGS__) + +/* + * Declare the dependency between a non-FGT register, a set of + * feature, and the set of individual bits it contains. This generates + * a struct reg_feat_map_desc. + */ +#define DECLARE_FEAT_MAP(n, r, m, f) \ + struct reg_feat_map_desc n = { \ + .name = #r, \ + .feat_map = NEEDS_FEAT(~r##_RES0, f), \ + .bit_feat_map = m, \ + .bit_feat_map_sz = ARRAY_SIZE(m), \ + } + +/* + * Specialised version of the above for FGT registers that have their + * RES0 masks described as struct fgt_masks. + */ +#define DECLARE_FEAT_MAP_FGT(n, msk, m, f) \ + struct reg_feat_map_desc n = { \ + .name = #msk, \ + .feat_map = NEEDS_FEAT_RES0(&msk.res0, f),\ + .bit_feat_map = m, \ + .bit_feat_map_sz = ARRAY_SIZE(m), \ + } + +#define FEAT_SPE ID_AA64DFR0_EL1, PMSVer, IMP +#define FEAT_SPE_FnE ID_AA64DFR0_EL1, PMSVer, V1P2 +#define FEAT_BRBE ID_AA64DFR0_EL1, BRBE, IMP +#define FEAT_TRC_SR ID_AA64DFR0_EL1, TraceVer, IMP +#define FEAT_PMUv3 ID_AA64DFR0_EL1, PMUVer, IMP +#define FEAT_TRBE ID_AA64DFR0_EL1, TraceBuffer, IMP +#define FEAT_TRBEv1p1 ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1 +#define FEAT_DoubleLock ID_AA64DFR0_EL1, DoubleLock, IMP +#define FEAT_TRF ID_AA64DFR0_EL1, TraceFilt, IMP +#define FEAT_AA32EL0 ID_AA64PFR0_EL1, EL0, AARCH32 +#define FEAT_AA32EL1 ID_AA64PFR0_EL1, EL1, AARCH32 +#define FEAT_AA64EL1 ID_AA64PFR0_EL1, EL1, IMP +#define FEAT_AA64EL2 ID_AA64PFR0_EL1, EL2, IMP +#define FEAT_AA64EL3 ID_AA64PFR0_EL1, EL3, IMP +#define FEAT_AIE ID_AA64MMFR3_EL1, AIE, IMP +#define FEAT_S2POE ID_AA64MMFR3_EL1, S2POE, IMP +#define FEAT_S1POE ID_AA64MMFR3_EL1, S1POE, IMP +#define FEAT_S1PIE ID_AA64MMFR3_EL1, S1PIE, IMP +#define FEAT_THE ID_AA64PFR1_EL1, THE, IMP +#define FEAT_SME ID_AA64PFR1_EL1, SME, IMP +#define FEAT_GCS ID_AA64PFR1_EL1, GCS, IMP +#define FEAT_LS64 ID_AA64ISAR1_EL1, LS64, LS64 +#define FEAT_LS64_V ID_AA64ISAR1_EL1, LS64, LS64_V +#define FEAT_LS64_ACCDATA ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA +#define FEAT_RAS ID_AA64PFR0_EL1, RAS, IMP +#define FEAT_RASv2 ID_AA64PFR0_EL1, RAS, V2 +#define FEAT_GICv3 ID_AA64PFR0_EL1, GIC, IMP +#define FEAT_LOR ID_AA64MMFR1_EL1, LO, IMP +#define FEAT_SPEv1p2 ID_AA64DFR0_EL1, PMSVer, V1P2 +#define FEAT_SPEv1p4 ID_AA64DFR0_EL1, PMSVer, V1P4 +#define FEAT_SPEv1p5 ID_AA64DFR0_EL1, PMSVer, V1P5 +#define FEAT_ATS1A ID_AA64ISAR2_EL1, ATS1A, IMP +#define FEAT_SPECRES2 ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX +#define FEAT_SPECRES ID_AA64ISAR1_EL1, SPECRES, IMP +#define FEAT_TLBIRANGE ID_AA64ISAR0_EL1, TLB, RANGE +#define FEAT_TLBIOS ID_AA64ISAR0_EL1, TLB, OS +#define FEAT_PAN2 ID_AA64MMFR1_EL1, PAN, PAN2 +#define FEAT_DPB2 ID_AA64ISAR1_EL1, DPB, DPB2 +#define FEAT_AMUv1 ID_AA64PFR0_EL1, AMU, IMP +#define FEAT_AMUv1p1 ID_AA64PFR0_EL1, AMU, V1P1 +#define FEAT_CMOW ID_AA64MMFR1_EL1, CMOW, IMP +#define FEAT_D128 ID_AA64MMFR3_EL1, D128, IMP +#define FEAT_DoubleFault2 ID_AA64PFR1_EL1, DF2, IMP +#define FEAT_FPMR ID_AA64PFR2_EL1, FPMR, IMP +#define FEAT_MOPS ID_AA64ISAR2_EL1, MOPS, IMP +#define FEAT_NMI ID_AA64PFR1_EL1, NMI, IMP +#define FEAT_SCTLR2 ID_AA64MMFR3_EL1, SCTLRX, IMP +#define FEAT_SYSREG128 ID_AA64ISAR2_EL1, SYSREG_128, IMP +#define FEAT_TCR2 ID_AA64MMFR3_EL1, TCRX, IMP +#define FEAT_XS ID_AA64ISAR1_EL1, XS, IMP +#define FEAT_EVT ID_AA64MMFR2_EL1, EVT, IMP +#define FEAT_EVT_TTLBxS ID_AA64MMFR2_EL1, EVT, TTLBxS +#define FEAT_MTE2 ID_AA64PFR1_EL1, MTE, MTE2 +#define FEAT_RME ID_AA64PFR0_EL1, RME, IMP +#define FEAT_MPAM ID_AA64PFR0_EL1, MPAM, 1 +#define FEAT_S2FWB ID_AA64MMFR2_EL1, FWB, IMP +#define FEAT_TME ID_AA64ISAR0_EL1, TME, IMP +#define FEAT_TWED ID_AA64MMFR1_EL1, TWED, IMP +#define FEAT_E2H0 ID_AA64MMFR4_EL1, E2H0, IMP +#define FEAT_SRMASK ID_AA64MMFR4_EL1, SRMASK, IMP +#define FEAT_PoPS ID_AA64MMFR4_EL1, PoPS, IMP +#define FEAT_PFAR ID_AA64PFR1_EL1, PFAR, IMP +#define FEAT_Debugv8p9 ID_AA64DFR0_EL1, PMUVer, V3P9 +#define FEAT_PMUv3_SS ID_AA64DFR0_EL1, PMSS, IMP +#define FEAT_SEBEP ID_AA64DFR0_EL1, SEBEP, IMP +#define FEAT_EBEP ID_AA64DFR1_EL1, EBEP, IMP +#define FEAT_ITE ID_AA64DFR1_EL1, ITE, IMP +#define FEAT_PMUv3_ICNTR ID_AA64DFR1_EL1, PMICNTR, IMP +#define FEAT_SPMU ID_AA64DFR1_EL1, SPMU, IMP +#define FEAT_SPE_nVM ID_AA64DFR2_EL1, SPE_nVM, IMP +#define FEAT_STEP2 ID_AA64DFR2_EL1, STEP, IMP +#define FEAT_CPA2 ID_AA64ISAR3_EL1, CPA, CPA2 +#define FEAT_ASID2 ID_AA64MMFR4_EL1, ASID2, IMP +#define FEAT_MEC ID_AA64MMFR3_EL1, MEC, IMP +#define FEAT_HAFT ID_AA64MMFR1_EL1, HAFDBS, HAFT +#define FEAT_BTI ID_AA64PFR1_EL1, BT, IMP +#define FEAT_ExS ID_AA64MMFR0_EL1, EXS, IMP +#define FEAT_IESB ID_AA64MMFR2_EL1, IESB, IMP +#define FEAT_LSE2 ID_AA64MMFR2_EL1, AT, IMP +#define FEAT_LSMAOC ID_AA64MMFR2_EL1, LSM, IMP +#define FEAT_MixedEnd ID_AA64MMFR0_EL1, BIGEND, IMP +#define FEAT_MixedEndEL0 ID_AA64MMFR0_EL1, BIGENDEL0, IMP +#define FEAT_MTE_ASYNC ID_AA64PFR1_EL1, MTE_frac, ASYNC +#define FEAT_MTE_STORE_ONLY ID_AA64PFR2_EL1, MTESTOREONLY, IMP +#define FEAT_PAN ID_AA64MMFR1_EL1, PAN, IMP +#define FEAT_PAN3 ID_AA64MMFR1_EL1, PAN, PAN3 +#define FEAT_SSBS ID_AA64PFR1_EL1, SSBS, IMP +#define FEAT_TIDCP1 ID_AA64MMFR1_EL1, TIDCP1, IMP +#define FEAT_FGT ID_AA64MMFR0_EL1, FGT, IMP +#define FEAT_FGT2 ID_AA64MMFR0_EL1, FGT, FGT2 +#define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP +#define FEAT_HCX ID_AA64MMFR1_EL1, HCX, IMP + +static bool not_feat_aa64el3(struct kvm *kvm) +{ + return !kvm_has_feat(kvm, FEAT_AA64EL3); +} + +static bool feat_nv2(struct kvm *kvm) +{ + return ((kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) && + kvm_has_feat_enum(kvm, ID_AA64MMFR2_EL1, NV, NI)) || + kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2)); +} + +static bool feat_nv2_e2h0_ni(struct kvm *kvm) +{ + return feat_nv2(kvm) && !kvm_has_feat(kvm, FEAT_E2H0); +} + +static bool feat_rasv1p1(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) || + (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) && + kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1))); +} + +static bool feat_csv2_2_csv2_1p2(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) || + (kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2) && + kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, CSV2, IMP))); +} + +static bool feat_pauth(struct kvm *kvm) +{ + return kvm_has_pauth(kvm, PAuth); +} + +static bool feat_pauth_lr(struct kvm *kvm) +{ + return kvm_has_pauth(kvm, PAuth_LR); +} + +static bool feat_aderr(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ADERR, FEAT_ADERR) && + kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SDERR, FEAT_ADERR)); +} + +static bool feat_anerr(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ANERR, FEAT_ANERR) && + kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SNERR, FEAT_ANERR)); +} + +static bool feat_sme_smps(struct kvm *kvm) +{ + /* + * Revists this if KVM ever supports SME -- this really should + * look at the guest's view of SMIDR_EL1. Funnily enough, this + * is not captured in the JSON file, but only as a note in the + * ARM ARM. + */ + return (kvm_has_feat(kvm, FEAT_SME) && + (read_sysreg_s(SYS_SMIDR_EL1) & SMIDR_EL1_SMPS)); +} + +static bool feat_spe_fds(struct kvm *kvm) +{ + /* + * Revists this if KVM ever supports SPE -- this really should + * look at the guest's view of PMSIDR_EL1. + */ + return (kvm_has_feat(kvm, FEAT_SPEv1p4) && + (read_sysreg_s(SYS_PMSIDR_EL1) & PMSIDR_EL1_FDS)); +} + +static bool feat_trbe_mpam(struct kvm *kvm) +{ + /* + * Revists this if KVM ever supports both MPAM and TRBE -- + * this really should look at the guest's view of TRBIDR_EL1. + */ + return (kvm_has_feat(kvm, FEAT_TRBE) && + kvm_has_feat(kvm, FEAT_MPAM) && + (read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_MPAM)); +} + +static bool feat_asid2_e2h1(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_ASID2) && !kvm_has_feat(kvm, FEAT_E2H0); +} + +static bool feat_d128_e2h1(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_D128) && !kvm_has_feat(kvm, FEAT_E2H0); +} + +static bool feat_mec_e2h1(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_MEC) && !kvm_has_feat(kvm, FEAT_E2H0); +} + +static bool feat_ebep_pmuv3_ss(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_EBEP) || kvm_has_feat(kvm, FEAT_PMUv3_SS); +} + +static bool feat_mixedendel0(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_MixedEnd) || kvm_has_feat(kvm, FEAT_MixedEndEL0); +} + +static bool feat_mte_async(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_MTE2) && kvm_has_feat_enum(kvm, FEAT_MTE_ASYNC); +} + +#define check_pmu_revision(k, r) \ + ({ \ + (kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, r) && \ + !kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, IMP_DEF)); \ + }) + +static bool feat_pmuv3p1(struct kvm *kvm) +{ + return check_pmu_revision(kvm, V3P1); +} + +static bool feat_pmuv3p5(struct kvm *kvm) +{ + return check_pmu_revision(kvm, V3P5); +} + +static bool feat_pmuv3p7(struct kvm *kvm) +{ + return check_pmu_revision(kvm, V3P7); +} + +static bool feat_pmuv3p9(struct kvm *kvm) +{ + return check_pmu_revision(kvm, V3P9); +} + +static bool compute_hcr_rw(struct kvm *kvm, u64 *bits) +{ + /* This is purely academic: AArch32 and NV are mutually exclusive */ + if (bits) { + if (kvm_has_feat(kvm, FEAT_AA32EL1)) + *bits &= ~HCR_EL2_RW; + else + *bits |= HCR_EL2_RW; + } + + return true; +} + +static bool compute_hcr_e2h(struct kvm *kvm, u64 *bits) +{ + if (bits) { + if (kvm_has_feat(kvm, FEAT_E2H0)) + *bits &= ~HCR_EL2_E2H; + else + *bits |= HCR_EL2_E2H; + } + + return true; +} + +static const struct reg_bits_to_feat_map hfgrtr_feat_map[] = { + NEEDS_FEAT(HFGRTR_EL2_nAMAIR2_EL1 | + HFGRTR_EL2_nMAIR2_EL1, + FEAT_AIE), + NEEDS_FEAT(HFGRTR_EL2_nS2POR_EL1, FEAT_S2POE), + NEEDS_FEAT(HFGRTR_EL2_nPOR_EL1 | + HFGRTR_EL2_nPOR_EL0, + FEAT_S1POE), + NEEDS_FEAT(HFGRTR_EL2_nPIR_EL1 | + HFGRTR_EL2_nPIRE0_EL1, + FEAT_S1PIE), + NEEDS_FEAT(HFGRTR_EL2_nRCWMASK_EL1, FEAT_THE), + NEEDS_FEAT(HFGRTR_EL2_nTPIDR2_EL0 | + HFGRTR_EL2_nSMPRI_EL1, + FEAT_SME), + NEEDS_FEAT(HFGRTR_EL2_nGCS_EL1 | + HFGRTR_EL2_nGCS_EL0, + FEAT_GCS), + NEEDS_FEAT(HFGRTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA), + NEEDS_FEAT(HFGRTR_EL2_ERXADDR_EL1 | + HFGRTR_EL2_ERXMISCn_EL1 | + HFGRTR_EL2_ERXSTATUS_EL1 | + HFGRTR_EL2_ERXCTLR_EL1 | + HFGRTR_EL2_ERXFR_EL1 | + HFGRTR_EL2_ERRSELR_EL1 | + HFGRTR_EL2_ERRIDR_EL1, + FEAT_RAS), + NEEDS_FEAT(HFGRTR_EL2_ERXPFGCDN_EL1 | + HFGRTR_EL2_ERXPFGCTL_EL1 | + HFGRTR_EL2_ERXPFGF_EL1, + feat_rasv1p1), + NEEDS_FEAT(HFGRTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3), + NEEDS_FEAT(HFGRTR_EL2_SCXTNUM_EL0 | + HFGRTR_EL2_SCXTNUM_EL1, + feat_csv2_2_csv2_1p2), + NEEDS_FEAT(HFGRTR_EL2_LORSA_EL1 | + HFGRTR_EL2_LORN_EL1 | + HFGRTR_EL2_LORID_EL1 | + HFGRTR_EL2_LOREA_EL1 | + HFGRTR_EL2_LORC_EL1, + FEAT_LOR), + NEEDS_FEAT(HFGRTR_EL2_APIBKey | + HFGRTR_EL2_APIAKey | + HFGRTR_EL2_APGAKey | + HFGRTR_EL2_APDBKey | + HFGRTR_EL2_APDAKey, + feat_pauth), + NEEDS_FEAT_FLAG(HFGRTR_EL2_VBAR_EL1 | + HFGRTR_EL2_TTBR1_EL1 | + HFGRTR_EL2_TTBR0_EL1 | + HFGRTR_EL2_TPIDR_EL0 | + HFGRTR_EL2_TPIDRRO_EL0 | + HFGRTR_EL2_TPIDR_EL1 | + HFGRTR_EL2_TCR_EL1 | + HFGRTR_EL2_SCTLR_EL1 | + HFGRTR_EL2_REVIDR_EL1 | + HFGRTR_EL2_PAR_EL1 | + HFGRTR_EL2_MPIDR_EL1 | + HFGRTR_EL2_MIDR_EL1 | + HFGRTR_EL2_MAIR_EL1 | + HFGRTR_EL2_ISR_EL1 | + HFGRTR_EL2_FAR_EL1 | + HFGRTR_EL2_ESR_EL1 | + HFGRTR_EL2_DCZID_EL0 | + HFGRTR_EL2_CTR_EL0 | + HFGRTR_EL2_CSSELR_EL1 | + HFGRTR_EL2_CPACR_EL1 | + HFGRTR_EL2_CONTEXTIDR_EL1| + HFGRTR_EL2_CLIDR_EL1 | + HFGRTR_EL2_CCSIDR_EL1 | + HFGRTR_EL2_AMAIR_EL1 | + HFGRTR_EL2_AIDR_EL1 | + HFGRTR_EL2_AFSR1_EL1 | + HFGRTR_EL2_AFSR0_EL1, + NEVER_FGU, FEAT_AA64EL1), +}; + + +static const DECLARE_FEAT_MAP_FGT(hfgrtr_desc, hfgrtr_masks, + hfgrtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hfgwtr_feat_map[] = { + NEEDS_FEAT(HFGWTR_EL2_nAMAIR2_EL1 | + HFGWTR_EL2_nMAIR2_EL1, + FEAT_AIE), + NEEDS_FEAT(HFGWTR_EL2_nS2POR_EL1, FEAT_S2POE), + NEEDS_FEAT(HFGWTR_EL2_nPOR_EL1 | + HFGWTR_EL2_nPOR_EL0, + FEAT_S1POE), + NEEDS_FEAT(HFGWTR_EL2_nPIR_EL1 | + HFGWTR_EL2_nPIRE0_EL1, + FEAT_S1PIE), + NEEDS_FEAT(HFGWTR_EL2_nRCWMASK_EL1, FEAT_THE), + NEEDS_FEAT(HFGWTR_EL2_nTPIDR2_EL0 | + HFGWTR_EL2_nSMPRI_EL1, + FEAT_SME), + NEEDS_FEAT(HFGWTR_EL2_nGCS_EL1 | + HFGWTR_EL2_nGCS_EL0, + FEAT_GCS), + NEEDS_FEAT(HFGWTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA), + NEEDS_FEAT(HFGWTR_EL2_ERXADDR_EL1 | + HFGWTR_EL2_ERXMISCn_EL1 | + HFGWTR_EL2_ERXSTATUS_EL1 | + HFGWTR_EL2_ERXCTLR_EL1 | + HFGWTR_EL2_ERRSELR_EL1, + FEAT_RAS), + NEEDS_FEAT(HFGWTR_EL2_ERXPFGCDN_EL1 | + HFGWTR_EL2_ERXPFGCTL_EL1, + feat_rasv1p1), + NEEDS_FEAT(HFGWTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3), + NEEDS_FEAT(HFGWTR_EL2_SCXTNUM_EL0 | + HFGWTR_EL2_SCXTNUM_EL1, + feat_csv2_2_csv2_1p2), + NEEDS_FEAT(HFGWTR_EL2_LORSA_EL1 | + HFGWTR_EL2_LORN_EL1 | + HFGWTR_EL2_LOREA_EL1 | + HFGWTR_EL2_LORC_EL1, + FEAT_LOR), + NEEDS_FEAT(HFGWTR_EL2_APIBKey | + HFGWTR_EL2_APIAKey | + HFGWTR_EL2_APGAKey | + HFGWTR_EL2_APDBKey | + HFGWTR_EL2_APDAKey, + feat_pauth), + NEEDS_FEAT_FLAG(HFGWTR_EL2_VBAR_EL1 | + HFGWTR_EL2_TTBR1_EL1 | + HFGWTR_EL2_TTBR0_EL1 | + HFGWTR_EL2_TPIDR_EL0 | + HFGWTR_EL2_TPIDRRO_EL0 | + HFGWTR_EL2_TPIDR_EL1 | + HFGWTR_EL2_TCR_EL1 | + HFGWTR_EL2_SCTLR_EL1 | + HFGWTR_EL2_PAR_EL1 | + HFGWTR_EL2_MAIR_EL1 | + HFGWTR_EL2_FAR_EL1 | + HFGWTR_EL2_ESR_EL1 | + HFGWTR_EL2_CSSELR_EL1 | + HFGWTR_EL2_CPACR_EL1 | + HFGWTR_EL2_CONTEXTIDR_EL1| + HFGWTR_EL2_AMAIR_EL1 | + HFGWTR_EL2_AFSR1_EL1 | + HFGWTR_EL2_AFSR0_EL1, + NEVER_FGU, FEAT_AA64EL1), +}; + +static const DECLARE_FEAT_MAP_FGT(hfgwtr_desc, hfgwtr_masks, + hfgwtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hdfgrtr_feat_map[] = { + NEEDS_FEAT(HDFGRTR_EL2_PMBIDR_EL1 | + HDFGRTR_EL2_PMSLATFR_EL1 | + HDFGRTR_EL2_PMSIRR_EL1 | + HDFGRTR_EL2_PMSIDR_EL1 | + HDFGRTR_EL2_PMSICR_EL1 | + HDFGRTR_EL2_PMSFCR_EL1 | + HDFGRTR_EL2_PMSEVFR_EL1 | + HDFGRTR_EL2_PMSCR_EL1 | + HDFGRTR_EL2_PMBSR_EL1 | + HDFGRTR_EL2_PMBPTR_EL1 | + HDFGRTR_EL2_PMBLIMITR_EL1, + FEAT_SPE), + NEEDS_FEAT(HDFGRTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE), + NEEDS_FEAT(HDFGRTR_EL2_nBRBDATA | + HDFGRTR_EL2_nBRBCTL | + HDFGRTR_EL2_nBRBIDR, + FEAT_BRBE), + NEEDS_FEAT(HDFGRTR_EL2_TRCVICTLR | + HDFGRTR_EL2_TRCSTATR | + HDFGRTR_EL2_TRCSSCSRn | + HDFGRTR_EL2_TRCSEQSTR | + HDFGRTR_EL2_TRCPRGCTLR | + HDFGRTR_EL2_TRCOSLSR | + HDFGRTR_EL2_TRCIMSPECn | + HDFGRTR_EL2_TRCID | + HDFGRTR_EL2_TRCCNTVRn | + HDFGRTR_EL2_TRCCLAIM | + HDFGRTR_EL2_TRCAUXCTLR | + HDFGRTR_EL2_TRCAUTHSTATUS | + HDFGRTR_EL2_TRC, + FEAT_TRC_SR), + NEEDS_FEAT(HDFGRTR_EL2_PMCEIDn_EL0 | + HDFGRTR_EL2_PMUSERENR_EL0 | + HDFGRTR_EL2_PMMIR_EL1 | + HDFGRTR_EL2_PMSELR_EL0 | + HDFGRTR_EL2_PMOVS | + HDFGRTR_EL2_PMINTEN | + HDFGRTR_EL2_PMCNTEN | + HDFGRTR_EL2_PMCCNTR_EL0 | + HDFGRTR_EL2_PMCCFILTR_EL0 | + HDFGRTR_EL2_PMEVTYPERn_EL0 | + HDFGRTR_EL2_PMEVCNTRn_EL0, + FEAT_PMUv3), + NEEDS_FEAT(HDFGRTR_EL2_TRBTRG_EL1 | + HDFGRTR_EL2_TRBSR_EL1 | + HDFGRTR_EL2_TRBPTR_EL1 | + HDFGRTR_EL2_TRBMAR_EL1 | + HDFGRTR_EL2_TRBLIMITR_EL1 | + HDFGRTR_EL2_TRBIDR_EL1 | + HDFGRTR_EL2_TRBBASER_EL1, + FEAT_TRBE), + NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSDLR_EL1, NEVER_FGU, + FEAT_DoubleLock), + NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSECCR_EL1 | + HDFGRTR_EL2_OSLSR_EL1 | + HDFGRTR_EL2_DBGPRCR_EL1 | + HDFGRTR_EL2_DBGAUTHSTATUS_EL1| + HDFGRTR_EL2_DBGCLAIM | + HDFGRTR_EL2_MDSCR_EL1 | + HDFGRTR_EL2_DBGWVRn_EL1 | + HDFGRTR_EL2_DBGWCRn_EL1 | + HDFGRTR_EL2_DBGBVRn_EL1 | + HDFGRTR_EL2_DBGBCRn_EL1, + NEVER_FGU, FEAT_AA64EL1) +}; + +static const DECLARE_FEAT_MAP_FGT(hdfgrtr_desc, hdfgrtr_masks, + hdfgrtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hdfgwtr_feat_map[] = { + NEEDS_FEAT(HDFGWTR_EL2_PMSLATFR_EL1 | + HDFGWTR_EL2_PMSIRR_EL1 | + HDFGWTR_EL2_PMSICR_EL1 | + HDFGWTR_EL2_PMSFCR_EL1 | + HDFGWTR_EL2_PMSEVFR_EL1 | + HDFGWTR_EL2_PMSCR_EL1 | + HDFGWTR_EL2_PMBSR_EL1 | + HDFGWTR_EL2_PMBPTR_EL1 | + HDFGWTR_EL2_PMBLIMITR_EL1, + FEAT_SPE), + NEEDS_FEAT(HDFGWTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE), + NEEDS_FEAT(HDFGWTR_EL2_nBRBDATA | + HDFGWTR_EL2_nBRBCTL, + FEAT_BRBE), + NEEDS_FEAT(HDFGWTR_EL2_TRCVICTLR | + HDFGWTR_EL2_TRCSSCSRn | + HDFGWTR_EL2_TRCSEQSTR | + HDFGWTR_EL2_TRCPRGCTLR | + HDFGWTR_EL2_TRCOSLAR | + HDFGWTR_EL2_TRCIMSPECn | + HDFGWTR_EL2_TRCCNTVRn | + HDFGWTR_EL2_TRCCLAIM | + HDFGWTR_EL2_TRCAUXCTLR | + HDFGWTR_EL2_TRC, + FEAT_TRC_SR), + NEEDS_FEAT(HDFGWTR_EL2_PMUSERENR_EL0 | + HDFGWTR_EL2_PMCR_EL0 | + HDFGWTR_EL2_PMSWINC_EL0 | + HDFGWTR_EL2_PMSELR_EL0 | + HDFGWTR_EL2_PMOVS | + HDFGWTR_EL2_PMINTEN | + HDFGWTR_EL2_PMCNTEN | + HDFGWTR_EL2_PMCCNTR_EL0 | + HDFGWTR_EL2_PMCCFILTR_EL0 | + HDFGWTR_EL2_PMEVTYPERn_EL0 | + HDFGWTR_EL2_PMEVCNTRn_EL0, + FEAT_PMUv3), + NEEDS_FEAT(HDFGWTR_EL2_TRBTRG_EL1 | + HDFGWTR_EL2_TRBSR_EL1 | + HDFGWTR_EL2_TRBPTR_EL1 | + HDFGWTR_EL2_TRBMAR_EL1 | + HDFGWTR_EL2_TRBLIMITR_EL1 | + HDFGWTR_EL2_TRBBASER_EL1, + FEAT_TRBE), + NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSDLR_EL1, + NEVER_FGU, FEAT_DoubleLock), + NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSECCR_EL1 | + HDFGWTR_EL2_OSLAR_EL1 | + HDFGWTR_EL2_DBGPRCR_EL1 | + HDFGWTR_EL2_DBGCLAIM | + HDFGWTR_EL2_MDSCR_EL1 | + HDFGWTR_EL2_DBGWVRn_EL1 | + HDFGWTR_EL2_DBGWCRn_EL1 | + HDFGWTR_EL2_DBGBVRn_EL1 | + HDFGWTR_EL2_DBGBCRn_EL1, + NEVER_FGU, FEAT_AA64EL1), + NEEDS_FEAT(HDFGWTR_EL2_TRFCR_EL1, FEAT_TRF), +}; + +static const DECLARE_FEAT_MAP_FGT(hdfgwtr_desc, hdfgwtr_masks, + hdfgwtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hfgitr_feat_map[] = { + NEEDS_FEAT(HFGITR_EL2_PSBCSYNC, FEAT_SPEv1p5), + NEEDS_FEAT(HFGITR_EL2_ATS1E1A, FEAT_ATS1A), + NEEDS_FEAT(HFGITR_EL2_COSPRCTX, FEAT_SPECRES2), + NEEDS_FEAT(HFGITR_EL2_nGCSEPP | + HFGITR_EL2_nGCSSTR_EL1 | + HFGITR_EL2_nGCSPUSHM_EL1, + FEAT_GCS), + NEEDS_FEAT(HFGITR_EL2_nBRBIALL | + HFGITR_EL2_nBRBINJ, + FEAT_BRBE), + NEEDS_FEAT(HFGITR_EL2_CPPRCTX | + HFGITR_EL2_DVPRCTX | + HFGITR_EL2_CFPRCTX, + FEAT_SPECRES), + NEEDS_FEAT(HFGITR_EL2_TLBIRVAALE1 | + HFGITR_EL2_TLBIRVALE1 | + HFGITR_EL2_TLBIRVAAE1 | + HFGITR_EL2_TLBIRVAE1 | + HFGITR_EL2_TLBIRVAALE1IS | + HFGITR_EL2_TLBIRVALE1IS | + HFGITR_EL2_TLBIRVAAE1IS | + HFGITR_EL2_TLBIRVAE1IS | + HFGITR_EL2_TLBIRVAALE1OS | + HFGITR_EL2_TLBIRVALE1OS | + HFGITR_EL2_TLBIRVAAE1OS | + HFGITR_EL2_TLBIRVAE1OS, + FEAT_TLBIRANGE), + NEEDS_FEAT(HFGITR_EL2_TLBIVAALE1OS | + HFGITR_EL2_TLBIVALE1OS | + HFGITR_EL2_TLBIVAAE1OS | + HFGITR_EL2_TLBIASIDE1OS | + HFGITR_EL2_TLBIVAE1OS | + HFGITR_EL2_TLBIVMALLE1OS, + FEAT_TLBIOS), + NEEDS_FEAT(HFGITR_EL2_ATS1E1WP | + HFGITR_EL2_ATS1E1RP, + FEAT_PAN2), + NEEDS_FEAT(HFGITR_EL2_DCCVADP, FEAT_DPB2), + NEEDS_FEAT_FLAG(HFGITR_EL2_DCCVAC | + HFGITR_EL2_SVC_EL1 | + HFGITR_EL2_SVC_EL0 | + HFGITR_EL2_ERET | + HFGITR_EL2_TLBIVAALE1 | + HFGITR_EL2_TLBIVALE1 | + HFGITR_EL2_TLBIVAAE1 | + HFGITR_EL2_TLBIASIDE1 | + HFGITR_EL2_TLBIVAE1 | + HFGITR_EL2_TLBIVMALLE1 | + HFGITR_EL2_TLBIVAALE1IS | + HFGITR_EL2_TLBIVALE1IS | + HFGITR_EL2_TLBIVAAE1IS | + HFGITR_EL2_TLBIASIDE1IS | + HFGITR_EL2_TLBIVAE1IS | + HFGITR_EL2_TLBIVMALLE1IS| + HFGITR_EL2_ATS1E0W | + HFGITR_EL2_ATS1E0R | + HFGITR_EL2_ATS1E1W | + HFGITR_EL2_ATS1E1R | + HFGITR_EL2_DCZVA | + HFGITR_EL2_DCCIVAC | + HFGITR_EL2_DCCVAP | + HFGITR_EL2_DCCVAU | + HFGITR_EL2_DCCISW | + HFGITR_EL2_DCCSW | + HFGITR_EL2_DCISW | + HFGITR_EL2_DCIVAC | + HFGITR_EL2_ICIVAU | + HFGITR_EL2_ICIALLU | + HFGITR_EL2_ICIALLUIS, + NEVER_FGU, FEAT_AA64EL1), +}; + +static const DECLARE_FEAT_MAP_FGT(hfgitr_desc, hfgitr_masks, + hfgitr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hafgrtr_feat_map[] = { + NEEDS_FEAT(HAFGRTR_EL2_AMEVTYPER115_EL0 | + HAFGRTR_EL2_AMEVTYPER114_EL0 | + HAFGRTR_EL2_AMEVTYPER113_EL0 | + HAFGRTR_EL2_AMEVTYPER112_EL0 | + HAFGRTR_EL2_AMEVTYPER111_EL0 | + HAFGRTR_EL2_AMEVTYPER110_EL0 | + HAFGRTR_EL2_AMEVTYPER19_EL0 | + HAFGRTR_EL2_AMEVTYPER18_EL0 | + HAFGRTR_EL2_AMEVTYPER17_EL0 | + HAFGRTR_EL2_AMEVTYPER16_EL0 | + HAFGRTR_EL2_AMEVTYPER15_EL0 | + HAFGRTR_EL2_AMEVTYPER14_EL0 | + HAFGRTR_EL2_AMEVTYPER13_EL0 | + HAFGRTR_EL2_AMEVTYPER12_EL0 | + HAFGRTR_EL2_AMEVTYPER11_EL0 | + HAFGRTR_EL2_AMEVTYPER10_EL0 | + HAFGRTR_EL2_AMEVCNTR115_EL0 | + HAFGRTR_EL2_AMEVCNTR114_EL0 | + HAFGRTR_EL2_AMEVCNTR113_EL0 | + HAFGRTR_EL2_AMEVCNTR112_EL0 | + HAFGRTR_EL2_AMEVCNTR111_EL0 | + HAFGRTR_EL2_AMEVCNTR110_EL0 | + HAFGRTR_EL2_AMEVCNTR19_EL0 | + HAFGRTR_EL2_AMEVCNTR18_EL0 | + HAFGRTR_EL2_AMEVCNTR17_EL0 | + HAFGRTR_EL2_AMEVCNTR16_EL0 | + HAFGRTR_EL2_AMEVCNTR15_EL0 | + HAFGRTR_EL2_AMEVCNTR14_EL0 | + HAFGRTR_EL2_AMEVCNTR13_EL0 | + HAFGRTR_EL2_AMEVCNTR12_EL0 | + HAFGRTR_EL2_AMEVCNTR11_EL0 | + HAFGRTR_EL2_AMEVCNTR10_EL0 | + HAFGRTR_EL2_AMCNTEN1 | + HAFGRTR_EL2_AMCNTEN0 | + HAFGRTR_EL2_AMEVCNTR03_EL0 | + HAFGRTR_EL2_AMEVCNTR02_EL0 | + HAFGRTR_EL2_AMEVCNTR01_EL0 | + HAFGRTR_EL2_AMEVCNTR00_EL0, + FEAT_AMUv1), +}; + +static const DECLARE_FEAT_MAP_FGT(hafgrtr_desc, hafgrtr_masks, + hafgrtr_feat_map, FEAT_FGT); + +static const struct reg_bits_to_feat_map hfgitr2_feat_map[] = { + NEEDS_FEAT(HFGITR2_EL2_nDCCIVAPS, FEAT_PoPS), + NEEDS_FEAT(HFGITR2_EL2_TSBCSYNC, FEAT_TRBEv1p1) +}; + +static const DECLARE_FEAT_MAP_FGT(hfgitr2_desc, hfgitr2_masks, + hfgitr2_feat_map, FEAT_FGT2); + +static const struct reg_bits_to_feat_map hfgrtr2_feat_map[] = { + NEEDS_FEAT(HFGRTR2_EL2_nPFAR_EL1, FEAT_PFAR), + NEEDS_FEAT(HFGRTR2_EL2_nERXGSR_EL1, FEAT_RASv2), + NEEDS_FEAT(HFGRTR2_EL2_nACTLRALIAS_EL1 | + HFGRTR2_EL2_nACTLRMASK_EL1 | + HFGRTR2_EL2_nCPACRALIAS_EL1 | + HFGRTR2_EL2_nCPACRMASK_EL1 | + HFGRTR2_EL2_nSCTLR2MASK_EL1 | + HFGRTR2_EL2_nSCTLRALIAS2_EL1 | + HFGRTR2_EL2_nSCTLRALIAS_EL1 | + HFGRTR2_EL2_nSCTLRMASK_EL1 | + HFGRTR2_EL2_nTCR2ALIAS_EL1 | + HFGRTR2_EL2_nTCR2MASK_EL1 | + HFGRTR2_EL2_nTCRALIAS_EL1 | + HFGRTR2_EL2_nTCRMASK_EL1, + FEAT_SRMASK), + NEEDS_FEAT(HFGRTR2_EL2_nRCWSMASK_EL1, FEAT_THE), +}; + +static const DECLARE_FEAT_MAP_FGT(hfgrtr2_desc, hfgrtr2_masks, + hfgrtr2_feat_map, FEAT_FGT2); + +static const struct reg_bits_to_feat_map hfgwtr2_feat_map[] = { + NEEDS_FEAT(HFGWTR2_EL2_nPFAR_EL1, FEAT_PFAR), + NEEDS_FEAT(HFGWTR2_EL2_nACTLRALIAS_EL1 | + HFGWTR2_EL2_nACTLRMASK_EL1 | + HFGWTR2_EL2_nCPACRALIAS_EL1 | + HFGWTR2_EL2_nCPACRMASK_EL1 | + HFGWTR2_EL2_nSCTLR2MASK_EL1 | + HFGWTR2_EL2_nSCTLRALIAS2_EL1 | + HFGWTR2_EL2_nSCTLRALIAS_EL1 | + HFGWTR2_EL2_nSCTLRMASK_EL1 | + HFGWTR2_EL2_nTCR2ALIAS_EL1 | + HFGWTR2_EL2_nTCR2MASK_EL1 | + HFGWTR2_EL2_nTCRALIAS_EL1 | + HFGWTR2_EL2_nTCRMASK_EL1, + FEAT_SRMASK), + NEEDS_FEAT(HFGWTR2_EL2_nRCWSMASK_EL1, FEAT_THE), +}; + +static const DECLARE_FEAT_MAP_FGT(hfgwtr2_desc, hfgwtr2_masks, + hfgwtr2_feat_map, FEAT_FGT2); + +static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = { + NEEDS_FEAT(HDFGRTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9), + NEEDS_FEAT(HDFGRTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss), + NEEDS_FEAT(HDFGRTR2_EL2_nTRCITECR_EL1, FEAT_ITE), + NEEDS_FEAT(HDFGRTR2_EL2_nPMICFILTR_EL0 | + HDFGRTR2_EL2_nPMICNTR_EL0, + FEAT_PMUv3_ICNTR), + NEEDS_FEAT(HDFGRTR2_EL2_nPMUACR_EL1, feat_pmuv3p9), + NEEDS_FEAT(HDFGRTR2_EL2_nPMSSCR_EL1 | + HDFGRTR2_EL2_nPMSSDATA, + FEAT_PMUv3_SS), + NEEDS_FEAT(HDFGRTR2_EL2_nPMIAR_EL1, FEAT_SEBEP), + NEEDS_FEAT(HDFGRTR2_EL2_nPMSDSFR_EL1, feat_spe_fds), + NEEDS_FEAT(HDFGRTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM), + NEEDS_FEAT(HDFGRTR2_EL2_nSPMACCESSR_EL1 | + HDFGRTR2_EL2_nSPMCNTEN | + HDFGRTR2_EL2_nSPMCR_EL0 | + HDFGRTR2_EL2_nSPMDEVAFF_EL1 | + HDFGRTR2_EL2_nSPMEVCNTRn_EL0 | + HDFGRTR2_EL2_nSPMEVTYPERn_EL0| + HDFGRTR2_EL2_nSPMID | + HDFGRTR2_EL2_nSPMINTEN | + HDFGRTR2_EL2_nSPMOVS | + HDFGRTR2_EL2_nSPMSCR_EL1 | + HDFGRTR2_EL2_nSPMSELR_EL0, + FEAT_SPMU), + NEEDS_FEAT(HDFGRTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2), + NEEDS_FEAT(HDFGRTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam), +}; + +static const DECLARE_FEAT_MAP_FGT(hdfgrtr2_desc, hdfgrtr2_masks, + hdfgrtr2_feat_map, FEAT_FGT2); + +static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = { + NEEDS_FEAT(HDFGWTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9), + NEEDS_FEAT(HDFGWTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss), + NEEDS_FEAT(HDFGWTR2_EL2_nTRCITECR_EL1, FEAT_ITE), + NEEDS_FEAT(HDFGWTR2_EL2_nPMICFILTR_EL0 | + HDFGWTR2_EL2_nPMICNTR_EL0, + FEAT_PMUv3_ICNTR), + NEEDS_FEAT(HDFGWTR2_EL2_nPMUACR_EL1 | + HDFGWTR2_EL2_nPMZR_EL0, + feat_pmuv3p9), + NEEDS_FEAT(HDFGWTR2_EL2_nPMSSCR_EL1, FEAT_PMUv3_SS), + NEEDS_FEAT(HDFGWTR2_EL2_nPMIAR_EL1, FEAT_SEBEP), + NEEDS_FEAT(HDFGWTR2_EL2_nPMSDSFR_EL1, feat_spe_fds), + NEEDS_FEAT(HDFGWTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM), + NEEDS_FEAT(HDFGWTR2_EL2_nSPMACCESSR_EL1 | + HDFGWTR2_EL2_nSPMCNTEN | + HDFGWTR2_EL2_nSPMCR_EL0 | + HDFGWTR2_EL2_nSPMEVCNTRn_EL0 | + HDFGWTR2_EL2_nSPMEVTYPERn_EL0| + HDFGWTR2_EL2_nSPMINTEN | + HDFGWTR2_EL2_nSPMOVS | + HDFGWTR2_EL2_nSPMSCR_EL1 | + HDFGWTR2_EL2_nSPMSELR_EL0, + FEAT_SPMU), + NEEDS_FEAT(HDFGWTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2), + NEEDS_FEAT(HDFGWTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam), +}; + +static const DECLARE_FEAT_MAP_FGT(hdfgwtr2_desc, hdfgwtr2_masks, + hdfgwtr2_feat_map, FEAT_FGT2); + + +static const struct reg_bits_to_feat_map hcrx_feat_map[] = { + NEEDS_FEAT(HCRX_EL2_PACMEn, feat_pauth_lr), + NEEDS_FEAT(HCRX_EL2_EnFPM, FEAT_FPMR), + NEEDS_FEAT(HCRX_EL2_GCSEn, FEAT_GCS), + NEEDS_FEAT(HCRX_EL2_EnIDCP128, FEAT_SYSREG128), + NEEDS_FEAT(HCRX_EL2_EnSDERR, feat_aderr), + NEEDS_FEAT(HCRX_EL2_TMEA, FEAT_DoubleFault2), + NEEDS_FEAT(HCRX_EL2_EnSNERR, feat_anerr), + NEEDS_FEAT(HCRX_EL2_D128En, FEAT_D128), + NEEDS_FEAT(HCRX_EL2_PTTWI, FEAT_THE), + NEEDS_FEAT(HCRX_EL2_SCTLR2En, FEAT_SCTLR2), + NEEDS_FEAT(HCRX_EL2_TCR2En, FEAT_TCR2), + NEEDS_FEAT(HCRX_EL2_MSCEn | + HCRX_EL2_MCE2, + FEAT_MOPS), + NEEDS_FEAT(HCRX_EL2_CMOW, FEAT_CMOW), + NEEDS_FEAT(HCRX_EL2_VFNMI | + HCRX_EL2_VINMI | + HCRX_EL2_TALLINT, + FEAT_NMI), + NEEDS_FEAT(HCRX_EL2_SMPME, feat_sme_smps), + NEEDS_FEAT(HCRX_EL2_FGTnXS | + HCRX_EL2_FnXS, + FEAT_XS), + NEEDS_FEAT(HCRX_EL2_EnASR, FEAT_LS64_V), + NEEDS_FEAT(HCRX_EL2_EnALS, FEAT_LS64), + NEEDS_FEAT(HCRX_EL2_EnAS0, FEAT_LS64_ACCDATA), +}; + + +static const DECLARE_FEAT_MAP(hcrx_desc, __HCRX_EL2, + hcrx_feat_map, FEAT_HCX); + +static const struct reg_bits_to_feat_map hcr_feat_map[] = { + NEEDS_FEAT(HCR_EL2_TID0, FEAT_AA32EL0), + NEEDS_FEAT_FIXED(HCR_EL2_RW, compute_hcr_rw), + NEEDS_FEAT(HCR_EL2_HCD, not_feat_aa64el3), + NEEDS_FEAT(HCR_EL2_AMO | + HCR_EL2_BSU | + HCR_EL2_CD | + HCR_EL2_DC | + HCR_EL2_FB | + HCR_EL2_FMO | + HCR_EL2_ID | + HCR_EL2_IMO | + HCR_EL2_MIOCNCE | + HCR_EL2_PTW | + HCR_EL2_SWIO | + HCR_EL2_TACR | + HCR_EL2_TDZ | + HCR_EL2_TGE | + HCR_EL2_TID1 | + HCR_EL2_TID2 | + HCR_EL2_TID3 | + HCR_EL2_TIDCP | + HCR_EL2_TPCP | + HCR_EL2_TPU | + HCR_EL2_TRVM | + HCR_EL2_TSC | + HCR_EL2_TSW | + HCR_EL2_TTLB | + HCR_EL2_TVM | + HCR_EL2_TWE | + HCR_EL2_TWI | + HCR_EL2_VF | + HCR_EL2_VI | + HCR_EL2_VM | + HCR_EL2_VSE, + FEAT_AA64EL1), + NEEDS_FEAT(HCR_EL2_AMVOFFEN, FEAT_AMUv1p1), + NEEDS_FEAT(HCR_EL2_EnSCXT, feat_csv2_2_csv2_1p2), + NEEDS_FEAT(HCR_EL2_TICAB | + HCR_EL2_TID4 | + HCR_EL2_TOCU, + FEAT_EVT), + NEEDS_FEAT(HCR_EL2_TTLBIS | + HCR_EL2_TTLBOS, + FEAT_EVT_TTLBxS), + NEEDS_FEAT(HCR_EL2_TLOR, FEAT_LOR), + NEEDS_FEAT(HCR_EL2_ATA | + HCR_EL2_DCT | + HCR_EL2_TID5, + FEAT_MTE2), + NEEDS_FEAT(HCR_EL2_AT | /* Ignore the original FEAT_NV */ + HCR_EL2_NV2 | + HCR_EL2_NV, + feat_nv2), + NEEDS_FEAT(HCR_EL2_NV1, feat_nv2_e2h0_ni), /* Missing from JSON */ + NEEDS_FEAT(HCR_EL2_API | + HCR_EL2_APK, + feat_pauth), + NEEDS_FEAT(HCR_EL2_TEA | + HCR_EL2_TERR, + FEAT_RAS), + NEEDS_FEAT(HCR_EL2_FIEN, feat_rasv1p1), + NEEDS_FEAT(HCR_EL2_GPF, FEAT_RME), + NEEDS_FEAT(HCR_EL2_FWB, FEAT_S2FWB), + NEEDS_FEAT(HCR_EL2_TME, FEAT_TME), + NEEDS_FEAT(HCR_EL2_TWEDEL | + HCR_EL2_TWEDEn, + FEAT_TWED), + NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h), +}; + +static const DECLARE_FEAT_MAP(hcr_desc, HCR_EL2, + hcr_feat_map, FEAT_AA64EL2); + +static const struct reg_bits_to_feat_map sctlr2_feat_map[] = { + NEEDS_FEAT(SCTLR2_EL1_NMEA | + SCTLR2_EL1_EASE, + FEAT_DoubleFault2), + NEEDS_FEAT(SCTLR2_EL1_EnADERR, feat_aderr), + NEEDS_FEAT(SCTLR2_EL1_EnANERR, feat_anerr), + NEEDS_FEAT(SCTLR2_EL1_EnIDCP128, FEAT_SYSREG128), + NEEDS_FEAT(SCTLR2_EL1_EnPACM | + SCTLR2_EL1_EnPACM0, + feat_pauth_lr), + NEEDS_FEAT(SCTLR2_EL1_CPTA | + SCTLR2_EL1_CPTA0 | + SCTLR2_EL1_CPTM | + SCTLR2_EL1_CPTM0, + FEAT_CPA2), +}; + +static const DECLARE_FEAT_MAP(sctlr2_desc, SCTLR2_EL1, + sctlr2_feat_map, FEAT_SCTLR2); + +static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = { + NEEDS_FEAT(TCR2_EL2_FNG1 | + TCR2_EL2_FNG0 | + TCR2_EL2_A2, + feat_asid2_e2h1), + NEEDS_FEAT(TCR2_EL2_DisCH1 | + TCR2_EL2_DisCH0 | + TCR2_EL2_D128, + feat_d128_e2h1), + NEEDS_FEAT(TCR2_EL2_AMEC1, feat_mec_e2h1), + NEEDS_FEAT(TCR2_EL2_AMEC0, FEAT_MEC), + NEEDS_FEAT(TCR2_EL2_HAFT, FEAT_HAFT), + NEEDS_FEAT(TCR2_EL2_PTTWI | + TCR2_EL2_PnCH, + FEAT_THE), + NEEDS_FEAT(TCR2_EL2_AIE, FEAT_AIE), + NEEDS_FEAT(TCR2_EL2_POE | + TCR2_EL2_E0POE, + FEAT_S1POE), + NEEDS_FEAT(TCR2_EL2_PIE, FEAT_S1PIE), +}; + +static const DECLARE_FEAT_MAP(tcr2_el2_desc, TCR2_EL2, + tcr2_el2_feat_map, FEAT_TCR2); + +static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { + NEEDS_FEAT(SCTLR_EL1_CP15BEN | + SCTLR_EL1_ITD | + SCTLR_EL1_SED, + FEAT_AA32EL0), + NEEDS_FEAT(SCTLR_EL1_BT0 | + SCTLR_EL1_BT1, + FEAT_BTI), + NEEDS_FEAT(SCTLR_EL1_CMOW, FEAT_CMOW), + NEEDS_FEAT(SCTLR_EL1_TSCXT, feat_csv2_2_csv2_1p2), + NEEDS_FEAT(SCTLR_EL1_EIS | + SCTLR_EL1_EOS, + FEAT_ExS), + NEEDS_FEAT(SCTLR_EL1_EnFPM, FEAT_FPMR), + NEEDS_FEAT(SCTLR_EL1_IESB, FEAT_IESB), + NEEDS_FEAT(SCTLR_EL1_EnALS, FEAT_LS64), + NEEDS_FEAT(SCTLR_EL1_EnAS0, FEAT_LS64_ACCDATA), + NEEDS_FEAT(SCTLR_EL1_EnASR, FEAT_LS64_V), + NEEDS_FEAT(SCTLR_EL1_nAA, FEAT_LSE2), + NEEDS_FEAT(SCTLR_EL1_LSMAOE | + SCTLR_EL1_nTLSMD, + FEAT_LSMAOC), + NEEDS_FEAT(SCTLR_EL1_EE, FEAT_MixedEnd), + NEEDS_FEAT(SCTLR_EL1_E0E, feat_mixedendel0), + NEEDS_FEAT(SCTLR_EL1_MSCEn, FEAT_MOPS), + NEEDS_FEAT(SCTLR_EL1_ATA0 | + SCTLR_EL1_ATA | + SCTLR_EL1_TCF0 | + SCTLR_EL1_TCF, + FEAT_MTE2), + NEEDS_FEAT(SCTLR_EL1_ITFSB, feat_mte_async), + NEEDS_FEAT(SCTLR_EL1_TCSO0 | + SCTLR_EL1_TCSO, + FEAT_MTE_STORE_ONLY), + NEEDS_FEAT(SCTLR_EL1_NMI | + SCTLR_EL1_SPINTMASK, + FEAT_NMI), + NEEDS_FEAT(SCTLR_EL1_SPAN, FEAT_PAN), + NEEDS_FEAT(SCTLR_EL1_EPAN, FEAT_PAN3), + NEEDS_FEAT(SCTLR_EL1_EnDA | + SCTLR_EL1_EnDB | + SCTLR_EL1_EnIA | + SCTLR_EL1_EnIB, + feat_pauth), + NEEDS_FEAT(SCTLR_EL1_EnTP2, FEAT_SME), + NEEDS_FEAT(SCTLR_EL1_EnRCTX, FEAT_SPECRES), + NEEDS_FEAT(SCTLR_EL1_DSSBS, FEAT_SSBS), + NEEDS_FEAT(SCTLR_EL1_TIDCP, FEAT_TIDCP1), + NEEDS_FEAT(SCTLR_EL1_TME0 | + SCTLR_EL1_TME | + SCTLR_EL1_TMT0 | + SCTLR_EL1_TMT, + FEAT_TME), + NEEDS_FEAT(SCTLR_EL1_TWEDEL | + SCTLR_EL1_TWEDEn, + FEAT_TWED), + NEEDS_FEAT(SCTLR_EL1_UCI | + SCTLR_EL1_EE | + SCTLR_EL1_E0E | + SCTLR_EL1_WXN | + SCTLR_EL1_nTWE | + SCTLR_EL1_nTWI | + SCTLR_EL1_UCT | + SCTLR_EL1_DZE | + SCTLR_EL1_I | + SCTLR_EL1_UMA | + SCTLR_EL1_SA0 | + SCTLR_EL1_SA | + SCTLR_EL1_C | + SCTLR_EL1_A | + SCTLR_EL1_M, + FEAT_AA64EL1), +}; + +static const DECLARE_FEAT_MAP(sctlr_el1_desc, SCTLR_EL1, + sctlr_el1_feat_map, FEAT_AA64EL1); + +static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = { + NEEDS_FEAT(MDCR_EL2_EBWE, FEAT_Debugv8p9), + NEEDS_FEAT(MDCR_EL2_TDOSA, FEAT_DoubleLock), + NEEDS_FEAT(MDCR_EL2_PMEE, FEAT_EBEP), + NEEDS_FEAT(MDCR_EL2_TDCC, FEAT_FGT), + NEEDS_FEAT(MDCR_EL2_MTPME, FEAT_MTPMU), + NEEDS_FEAT(MDCR_EL2_HPME | + MDCR_EL2_HPMN | + MDCR_EL2_TPMCR | + MDCR_EL2_TPM, + FEAT_PMUv3), + NEEDS_FEAT(MDCR_EL2_HPMD, feat_pmuv3p1), + NEEDS_FEAT(MDCR_EL2_HCCD | + MDCR_EL2_HLP, + feat_pmuv3p5), + NEEDS_FEAT(MDCR_EL2_HPMFZO, feat_pmuv3p7), + NEEDS_FEAT(MDCR_EL2_PMSSE, FEAT_PMUv3_SS), + NEEDS_FEAT(MDCR_EL2_E2PB | + MDCR_EL2_TPMS, + FEAT_SPE), + NEEDS_FEAT(MDCR_EL2_HPMFZS, FEAT_SPEv1p2), + NEEDS_FEAT(MDCR_EL2_EnSPM, FEAT_SPMU), + NEEDS_FEAT(MDCR_EL2_EnSTEPOP, FEAT_STEP2), + NEEDS_FEAT(MDCR_EL2_E2TB, FEAT_TRBE), + NEEDS_FEAT(MDCR_EL2_TTRF, FEAT_TRF), + NEEDS_FEAT(MDCR_EL2_TDA | + MDCR_EL2_TDE | + MDCR_EL2_TDRA, + FEAT_AA64EL1), +}; + +static const DECLARE_FEAT_MAP(mdcr_el2_desc, MDCR_EL2, + mdcr_el2_feat_map, FEAT_AA64EL2); + +static void __init check_feat_map(const struct reg_bits_to_feat_map *map, + int map_size, u64 res0, const char *str) +{ + u64 mask = 0; + + for (int i = 0; i < map_size; i++) + mask |= map[i].bits; + + if (mask != ~res0) + kvm_err("Undefined %s behaviour, bits %016llx\n", + str, mask ^ ~res0); +} + +static u64 reg_feat_map_bits(const struct reg_bits_to_feat_map *map) +{ + return map->flags & RES0_POINTER ? ~(*map->res0p) : map->bits; +} + +static void __init check_reg_desc(const struct reg_feat_map_desc *r) +{ + check_feat_map(r->bit_feat_map, r->bit_feat_map_sz, + ~reg_feat_map_bits(&r->feat_map), r->name); +} + +void __init check_feature_map(void) +{ + check_reg_desc(&hfgrtr_desc); + check_reg_desc(&hfgwtr_desc); + check_reg_desc(&hfgitr_desc); + check_reg_desc(&hdfgrtr_desc); + check_reg_desc(&hdfgwtr_desc); + check_reg_desc(&hafgrtr_desc); + check_reg_desc(&hfgrtr2_desc); + check_reg_desc(&hfgwtr2_desc); + check_reg_desc(&hfgitr2_desc); + check_reg_desc(&hdfgrtr2_desc); + check_reg_desc(&hdfgwtr2_desc); + check_reg_desc(&hcrx_desc); + check_reg_desc(&hcr_desc); + check_reg_desc(&sctlr2_desc); + check_reg_desc(&tcr2_el2_desc); + check_reg_desc(&sctlr_el1_desc); + check_reg_desc(&mdcr_el2_desc); +} + +static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map) +{ + u64 regval = kvm->arch.id_regs[map->regidx]; + u64 regfld = (regval >> map->shift) & GENMASK(map->width - 1, 0); + + if (map->sign) { + s64 sfld = sign_extend64(regfld, map->width - 1); + s64 slim = sign_extend64(map->lo_lim, map->width - 1); + return sfld >= slim; + } else { + return regfld >= map->lo_lim; + } +} + +static u64 __compute_fixed_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + u64 *fixed_bits, + unsigned long require, + unsigned long exclude) +{ + u64 val = 0; + + for (int i = 0; i < map_size; i++) { + bool match; + + if ((map[i].flags & require) != require) + continue; + + if (map[i].flags & exclude) + continue; + + if (map[i].flags & CALL_FUNC) + match = (map[i].flags & FIXED_VALUE) ? + map[i].fval(kvm, fixed_bits) : + map[i].match(kvm); + else + match = idreg_feat_match(kvm, &map[i]); + + if (!match || (map[i].flags & FIXED_VALUE)) + val |= reg_feat_map_bits(&map[i]); + } + + return val; +} + +static u64 compute_res0_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + unsigned long require, + unsigned long exclude) +{ + return __compute_fixed_bits(kvm, map, map_size, NULL, + require, exclude | FIXED_VALUE); +} + +static u64 compute_reg_res0_bits(struct kvm *kvm, + const struct reg_feat_map_desc *r, + unsigned long require, unsigned long exclude) + +{ + u64 res0; + + res0 = compute_res0_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, + require, exclude); + + /* + * If computing FGUs, don't take RES0 or register existence + * into account -- we're not computing bits for the register + * itself. + */ + if (!(exclude & NEVER_FGU)) { + res0 |= compute_res0_bits(kvm, &r->feat_map, 1, require, exclude); + res0 |= ~reg_feat_map_bits(&r->feat_map); + } + + return res0; +} + +static u64 compute_reg_fixed_bits(struct kvm *kvm, + const struct reg_feat_map_desc *r, + u64 *fixed_bits, unsigned long require, + unsigned long exclude) +{ + return __compute_fixed_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, + fixed_bits, require | FIXED_VALUE, exclude); +} + +void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) +{ + u64 val = 0; + + switch (fgt) { + case HFGRTR_GROUP: + val |= compute_reg_res0_bits(kvm, &hfgrtr_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgwtr_desc, + 0, NEVER_FGU); + break; + case HFGITR_GROUP: + val |= compute_reg_res0_bits(kvm, &hfgitr_desc, + 0, NEVER_FGU); + break; + case HDFGRTR_GROUP: + val |= compute_reg_res0_bits(kvm, &hdfgrtr_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hdfgwtr_desc, + 0, NEVER_FGU); + break; + case HAFGRTR_GROUP: + val |= compute_reg_res0_bits(kvm, &hafgrtr_desc, + 0, NEVER_FGU); + break; + case HFGRTR2_GROUP: + val |= compute_reg_res0_bits(kvm, &hfgrtr2_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgwtr2_desc, + 0, NEVER_FGU); + break; + case HFGITR2_GROUP: + val |= compute_reg_res0_bits(kvm, &hfgitr2_desc, + 0, NEVER_FGU); + break; + case HDFGRTR2_GROUP: + val |= compute_reg_res0_bits(kvm, &hdfgrtr2_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hdfgwtr2_desc, + 0, NEVER_FGU); + break; + default: + BUG(); + } + + kvm->arch.fgu[fgt] = val; +} + +void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1) +{ + u64 fixed = 0, mask; + + switch (reg) { + case HFGRTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgrtr_desc, 0, 0); + *res1 = HFGRTR_EL2_RES1; + break; + case HFGWTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgwtr_desc, 0, 0); + *res1 = HFGWTR_EL2_RES1; + break; + case HFGITR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgitr_desc, 0, 0); + *res1 = HFGITR_EL2_RES1; + break; + case HDFGRTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hdfgrtr_desc, 0, 0); + *res1 = HDFGRTR_EL2_RES1; + break; + case HDFGWTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hdfgwtr_desc, 0, 0); + *res1 = HDFGWTR_EL2_RES1; + break; + case HAFGRTR_EL2: + *res0 = compute_reg_res0_bits(kvm, &hafgrtr_desc, 0, 0); + *res1 = HAFGRTR_EL2_RES1; + break; + case HFGRTR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgrtr2_desc, 0, 0); + *res1 = HFGRTR2_EL2_RES1; + break; + case HFGWTR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgwtr2_desc, 0, 0); + *res1 = HFGWTR2_EL2_RES1; + break; + case HFGITR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hfgitr2_desc, 0, 0); + *res1 = HFGITR2_EL2_RES1; + break; + case HDFGRTR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hdfgrtr2_desc, 0, 0); + *res1 = HDFGRTR2_EL2_RES1; + break; + case HDFGWTR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &hdfgwtr2_desc, 0, 0); + *res1 = HDFGWTR2_EL2_RES1; + break; + case HCRX_EL2: + *res0 = compute_reg_res0_bits(kvm, &hcrx_desc, 0, 0); + *res1 = __HCRX_EL2_RES1; + break; + case HCR_EL2: + mask = compute_reg_fixed_bits(kvm, &hcr_desc, &fixed, 0, 0); + *res0 = compute_reg_res0_bits(kvm, &hcr_desc, 0, 0); + *res0 |= (mask & ~fixed); + *res1 = HCR_EL2_RES1 | (mask & fixed); + break; + case SCTLR2_EL1: + case SCTLR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &sctlr2_desc, 0, 0); + *res1 = SCTLR2_EL1_RES1; + break; + case TCR2_EL2: + *res0 = compute_reg_res0_bits(kvm, &tcr2_el2_desc, 0, 0); + *res1 = TCR2_EL2_RES1; + break; + case SCTLR_EL1: + *res0 = compute_reg_res0_bits(kvm, &sctlr_el1_desc, 0, 0); + *res1 = SCTLR_EL1_RES1; + break; + case MDCR_EL2: + *res0 = compute_reg_res0_bits(kvm, &mdcr_el2_desc, 0, 0); + *res1 = MDCR_EL2_RES1; + break; + default: + WARN_ON_ONCE(1); + *res0 = *res1 = 0; + break; + } +} + +static __always_inline struct fgt_masks *__fgt_reg_to_masks(enum vcpu_sysreg reg) +{ + switch (reg) { + case HFGRTR_EL2: + return &hfgrtr_masks; + case HFGWTR_EL2: + return &hfgwtr_masks; + case HFGITR_EL2: + return &hfgitr_masks; + case HDFGRTR_EL2: + return &hdfgrtr_masks; + case HDFGWTR_EL2: + return &hdfgwtr_masks; + case HAFGRTR_EL2: + return &hafgrtr_masks; + case HFGRTR2_EL2: + return &hfgrtr2_masks; + case HFGWTR2_EL2: + return &hfgwtr2_masks; + case HFGITR2_EL2: + return &hfgitr2_masks; + case HDFGRTR2_EL2: + return &hdfgrtr2_masks; + case HDFGWTR2_EL2: + return &hdfgwtr2_masks; + default: + BUILD_BUG_ON(1); + } +} + +static __always_inline void __compute_fgt(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg) +{ + u64 fgu = vcpu->kvm->arch.fgu[__fgt_reg_to_group_id(reg)]; + struct fgt_masks *m = __fgt_reg_to_masks(reg); + u64 clear = 0, set = 0, val = m->nmask; + + set |= fgu & m->mask; + clear |= fgu & m->nmask; + + if (is_nested_ctxt(vcpu)) { + u64 nested = __vcpu_sys_reg(vcpu, reg); + set |= nested & m->mask; + clear |= ~nested & m->nmask; + } + + val |= set; + val &= ~clear; + *vcpu_fgt(vcpu, reg) = val; +} + +static void __compute_hfgwtr(struct kvm_vcpu *vcpu) +{ + __compute_fgt(vcpu, HFGWTR_EL2); + + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + *vcpu_fgt(vcpu, HFGWTR_EL2) |= HFGWTR_EL2_TCR_EL1; +} + +static void __compute_hdfgwtr(struct kvm_vcpu *vcpu) +{ + __compute_fgt(vcpu, HDFGWTR_EL2); + + if (is_hyp_ctxt(vcpu)) + *vcpu_fgt(vcpu, HDFGWTR_EL2) |= HDFGWTR_EL2_MDSCR_EL1; +} + +void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu) +{ + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + return; + + __compute_fgt(vcpu, HFGRTR_EL2); + __compute_hfgwtr(vcpu); + __compute_fgt(vcpu, HFGITR_EL2); + __compute_fgt(vcpu, HDFGRTR_EL2); + __compute_hdfgwtr(vcpu); + __compute_fgt(vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __compute_fgt(vcpu, HFGRTR2_EL2); + __compute_fgt(vcpu, HFGWTR2_EL2); + __compute_fgt(vcpu, HFGITR2_EL2); + __compute_fgt(vcpu, HDFGRTR2_EL2); + __compute_fgt(vcpu, HDFGWTR2_EL2); +} diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index fccf9ec01813..3ad6b7c6e4ba 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -3,7 +3,8 @@ * Debug and Guest Debug support * * Copyright (C) 2015 - Linaro Ltd - * Author: Alex Bennée <alex.bennee@linaro.org> + * Authors: Alex Bennée <alex.bennee@linaro.org> + * Oliver Upton <oliver.upton@linux.dev> */ #include <linux/kvm_host.h> @@ -14,70 +15,10 @@ #include <asm/kvm_arm.h> #include <asm/kvm_emulate.h> -#include "trace.h" - -/* These are the bits of MDSCR_EL1 we may manipulate */ -#define MDSCR_EL1_DEBUG_MASK (DBG_MDSCR_SS | \ - DBG_MDSCR_KDE | \ - DBG_MDSCR_MDE) - -static DEFINE_PER_CPU(u64, mdcr_el2); - -/** - * save/restore_guest_debug_regs - * - * For some debug operations we need to tweak some guest registers. As - * a result we need to save the state of those registers before we - * make those modifications. - * - * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled - * after we have restored the preserved value to the main context. - * - * When single-step is enabled by userspace, we tweak PSTATE.SS on every - * guest entry. Preserve PSTATE.SS so we can restore the original value - * for the vcpu after the single-step is disabled. - */ -static void save_guest_debug_regs(struct kvm_vcpu *vcpu) -{ - u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - - vcpu->arch.guest_debug_preserved.mdscr_el1 = val; - - trace_kvm_arm_set_dreg32("Saved MDSCR_EL1", - vcpu->arch.guest_debug_preserved.mdscr_el1); - - vcpu->arch.guest_debug_preserved.pstate_ss = - (*vcpu_cpsr(vcpu) & DBG_SPSR_SS); -} - -static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) -{ - u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1; - - vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); - - trace_kvm_arm_set_dreg32("Restored MDSCR_EL1", - vcpu_read_sys_reg(vcpu, MDSCR_EL1)); - - if (vcpu->arch.guest_debug_preserved.pstate_ss) - *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; - else - *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; -} - -/** - * kvm_arm_init_debug - grab what we need for debug - * - * Currently the sole task of this function is to retrieve the initial - * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has - * presumably been set-up by some knowledgeable bootcode. - * - * It is called once per-cpu during CPU hyp initialisation. - */ - -void kvm_arm_init_debug(void) +static int cpu_has_spe(u64 dfr0) { - __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2)); + return cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && + !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P); } /** @@ -95,11 +36,14 @@ void kvm_arm_init_debug(void) */ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) { + preempt_disable(); + /* * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK * to disable guest access to the profiling and trace buffers */ - vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; + vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, + *host_data_ptr(nr_event_counters)); vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMS | MDCR_EL2_TTRF | @@ -113,232 +57,234 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; /* - * Trap debug register access when one of the following is true: - * - Userspace is using the hardware to debug the guest - * (KVM_GUESTDBG_USE_HW is set). - * - The guest is not using debug (DEBUG_DIRTY clear). - * - The guest has enabled the OS Lock (debug exceptions are blocked). + * Trap debug registers if the guest doesn't have ownership of them. */ - if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) || - !vcpu_get_flag(vcpu, DEBUG_DIRTY) || - kvm_vcpu_os_lock_enabled(vcpu)) + if (!kvm_guest_owns_debug_regs(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; - trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); -} + if (vcpu_has_nv(vcpu)) + kvm_nested_setup_mdcr_el2(vcpu); + + /* Write MDCR_EL2 directly if we're already at EL2 */ + if (has_vhe()) + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); -/** - * kvm_arm_vcpu_init_debug - setup vcpu debug traps - * - * @vcpu: the vcpu pointer - * - * Set vcpu initial mdcr_el2 value. - */ -void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - kvm_arm_setup_mdcr_el2(vcpu); preempt_enable(); } -/** - * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state - */ +void kvm_init_host_debug_data(void) +{ + u64 dfr0 = read_sysreg(id_aa64dfr0_el1); + + if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0) + *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, + read_sysreg(pmcr_el0)); + + *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0); + *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0); + + if (cpu_has_spe(dfr0)) + host_data_set_flag(HAS_SPE); -void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) + if (has_vhe()) + return; + + /* Check if we have BRBE implemented and available at the host */ + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT)) + host_data_set_flag(HAS_BRBE); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) { + /* Force disable trace in protected mode in case of no TRBE */ + if (is_protected_kvm_enabled()) + host_data_set_flag(EL1_TRACING_CONFIGURED); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && + !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) + host_data_set_flag(HAS_TRBE); + } +} + +void kvm_debug_init_vhe(void) { - vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state; + /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */ + if (host_data_test_flag(HAS_SPE)) + write_sysreg_el1(0, SYS_PMSCR); } -/** - * kvm_arm_setup_debug - set up debug related stuff +/* + * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host + * has taken over MDSCR_EL1. * - * @vcpu: the vcpu pointer + * - Userspace is single-stepping the guest, and MDSCR_EL1.SS is forced to 1. * - * This is called before each entry into the hypervisor to setup any - * debug related registers. + * - Userspace is using the breakpoint/watchpoint registers to debug the + * guest, and MDSCR_EL1.MDE is forced to 1. * - * Additionally, KVM only traps guest accesses to the debug registers if - * the guest is not actively using them (see the DEBUG_DIRTY - * flag on vcpu->arch.iflags). Since the guest must not interfere - * with the hardware state when debugging the guest, we must ensure that - * trapping is enabled whenever we are debugging the guest using the - * debug registers. + * - The guest has enabled the OS Lock, and KVM is forcing MDSCR_EL1.MDE to 0, + * masking all debug exceptions affected by the OS Lock. */ +static void setup_external_mdscr(struct kvm_vcpu *vcpu) +{ + /* + * Use the guest's MDSCR_EL1 as a starting point, since there are + * several other features controlled by MDSCR_EL1 that are not relevant + * to the host. + * + * Clear the bits that KVM may use which also satisfies emulation of + * the OS Lock as MDSCR_EL1.MDE is cleared. + */ + u64 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1) & ~(MDSCR_EL1_SS | + MDSCR_EL1_MDE | + MDSCR_EL1_KDE); + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + mdscr |= MDSCR_EL1_SS; + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + mdscr |= MDSCR_EL1_MDE | MDSCR_EL1_KDE; -void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) + vcpu->arch.external_mdscr_el1 = mdscr; +} + +void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) { - unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2; + u64 mdscr; - trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug); + /* Must be called before kvm_vcpu_load_vhe() */ + KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); - kvm_arm_setup_mdcr_el2(vcpu); + if (has_vhe()) + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); - /* Check if we need to use the debug registers. */ + /* + * Determine which of the possible debug states we're in: + * + * - VCPU_DEBUG_HOST_OWNED: KVM has taken ownership of the guest's + * breakpoint/watchpoint registers, or needs to use MDSCR_EL1 to do + * software step or emulate the effects of the OS Lock being enabled. + * + * - VCPU_DEBUG_GUEST_OWNED: The guest has debug exceptions enabled, and + * the breakpoint/watchpoint registers need to be loaded eagerly. + * + * - VCPU_DEBUG_FREE: Neither of the above apply, no breakpoint/watchpoint + * context needs to be loaded on the CPU. + */ if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { - /* Save guest debug state */ - save_guest_debug_regs(vcpu); + vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED; + setup_external_mdscr(vcpu); /* - * Single Step (ARM ARM D2.12.3 The software step state - * machine) - * - * If we are doing Single Step we need to manipulate - * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the - * step has occurred the hypervisor will trap the - * debug exception and we return to userspace. - * - * If the guest attempts to single step its userspace - * we would have to deal with a trapped exception - * while in the guest kernel. Because this would be - * hard to unwind we suppress the guest's ability to - * do so by masking MDSCR_EL.SS. - * - * This confuses guest debuggers which use - * single-step behind the scenes but everything - * returns to normal once the host is no longer - * debugging the system. + * Steal the guest's single-step state machine if userspace wants + * single-step the guest. */ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - /* - * If the software step state at the last guest exit - * was Active-pending, we don't set DBG_SPSR_SS so - * that the state is maintained (to not run another - * single-step until the pending Software Step - * exception is taken). - */ - if (!vcpu_get_flag(vcpu, DBG_SS_ACTIVE_PENDING)) + if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS) + vcpu_clear_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + else + vcpu_set_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + + if (!vcpu_get_flag(vcpu, HOST_SS_ACTIVE_PENDING)) *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; else *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; - - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr |= DBG_MDSCR_SS; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - } else { - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr &= ~DBG_MDSCR_SS; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); } + } else { + mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu)); - - /* - * HW Breakpoints and watchpoints - * - * We simply switch the debug_ptr to point to our new - * external_debug_state which has been populated by the - * debug ioctl. The existing DEBUG_DIRTY mechanism ensures - * the registers are updated on the world switch. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - /* Enable breakpoints/watchpoints */ - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr |= DBG_MDSCR_MDE; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - - vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; - vcpu_set_flag(vcpu, DEBUG_DIRTY); - - trace_kvm_arm_set_regset("BKPTS", get_num_brps(), - &vcpu->arch.debug_ptr->dbg_bcr[0], - &vcpu->arch.debug_ptr->dbg_bvr[0]); - - trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), - &vcpu->arch.debug_ptr->dbg_wcr[0], - &vcpu->arch.debug_ptr->dbg_wvr[0]); - - /* - * The OS Lock blocks debug exceptions in all ELs when it is - * enabled. If the guest has enabled the OS Lock, constrain its - * effects to the guest. Emulate the behavior by clearing - * MDSCR_EL1.MDE. In so doing, we ensure that host debug - * exceptions are unaffected by guest configuration of the OS - * Lock. - */ - } else if (kvm_vcpu_os_lock_enabled(vcpu)) { - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr &= ~DBG_MDSCR_MDE; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - } + if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE)) + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + else + vcpu->arch.debug_owner = VCPU_DEBUG_FREE; } - BUG_ON(!vcpu->guest_debug && - vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state); - - /* If KDE or MDE are set, perform a full save/restore cycle. */ - if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) - vcpu_set_flag(vcpu, DEBUG_DIRTY); - - /* Write mdcr_el2 changes since vcpu_load on VHE systems */ - if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2) - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - - trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); + kvm_arm_setup_mdcr_el2(vcpu); } -void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) +void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) { - trace_kvm_arm_clear_debug(vcpu->guest_debug); + if (has_vhe()) + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + + if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + return; /* - * Restore the guest's debug registers if we were using them. + * Save the host's software step state and restore the guest's before + * potentially returning to userspace. */ - if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) - /* - * Mark the vcpu as ACTIVE_PENDING - * until Software Step exception is taken. - */ - vcpu_set_flag(vcpu, DBG_SS_ACTIVE_PENDING); - } + if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) + vcpu_set_flag(vcpu, HOST_SS_ACTIVE_PENDING); + else + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); + + if (vcpu_get_flag(vcpu, GUEST_SS_ACTIVE_PENDING)) + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; +} - restore_guest_debug_regs(vcpu); +/* + * Updates ownership of the debug registers after a trapped guest access to a + * breakpoint/watchpoint register. Host ownership of the debug registers is of + * strictly higher priority, and it is the responsibility of the VMM to emulate + * guest debug exceptions in this configuration. + */ +void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) +{ + if (kvm_host_owns_debug_regs(vcpu)) + return; - /* - * If we were using HW debug we need to restore the - * debug_ptr to the guest debug state. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - kvm_arm_reset_debug_ptr(vcpu); + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + kvm_arm_setup_mdcr_el2(vcpu); +} - trace_kvm_arm_set_regset("BKPTS", get_num_brps(), - &vcpu->arch.debug_ptr->dbg_bcr[0], - &vcpu->arch.debug_ptr->dbg_bvr[0]); +void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) +{ + if (val & OSLAR_EL1_OSLK) + __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, |=, OSLSR_EL1_OSLK); + else + __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, &=, ~OSLSR_EL1_OSLK); - trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), - &vcpu->arch.debug_ptr->dbg_wcr[0], - &vcpu->arch.debug_ptr->dbg_wvr[0]); - } - } + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); } -void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu) +static bool skip_trbe_access(bool skip_condition) { - u64 dfr0; + return (WARN_ON_ONCE(preemptible()) || skip_condition || + is_protected_kvm_enabled() || !is_kvm_arm_initialised()); +} - /* For VHE, there is nothing to do */ - if (has_vhe()) - return; +void kvm_enable_trbe(void) +{ + if (!skip_trbe_access(has_vhe())) + host_data_set_flag(TRBE_ENABLED); +} +EXPORT_SYMBOL_GPL(kvm_enable_trbe); - dfr0 = read_sysreg(id_aa64dfr0_el1); - /* - * If SPE is present on this CPU and is available at current EL, - * we may need to check if the host state needs to be saved. - */ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && - !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT))) - vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_SPE); - - /* Check if we have TRBE implemented and available at the host */ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && - !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG)) - vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_TRBE); +void kvm_disable_trbe(void) +{ + if (!skip_trbe_access(has_vhe())) + host_data_clear_flag(TRBE_ENABLED); } +EXPORT_SYMBOL_GPL(kvm_disable_trbe); -void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) +void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) { - vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE); - vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE); + if (skip_trbe_access(false)) + return; + + if (has_vhe()) { + write_sysreg_s(trfcr_while_in_guest, SYS_TRFCR_EL12); + return; + } + + *host_data_ptr(trfcr_while_in_guest) = trfcr_while_in_guest; + if (read_sysreg_s(SYS_TRFCR_EL1) != trfcr_while_in_guest) + host_data_set_flag(EL1_TRACING_CONFIGURED); + else + host_data_clear_flag(EL1_TRACING_CONFIGURED); } +EXPORT_SYMBOL_GPL(kvm_tracing_set_el1_configuration); diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c new file mode 100644 index 000000000000..834f13fb1fb7 --- /dev/null +++ b/arch/arm64/kvm/emulate-nested.c @@ -0,0 +1,2854 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 - Linaro and Columbia University + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> + +#include "hyp/include/hyp/adjust_pc.h" + +#include "trace.h" + +enum trap_behaviour { + BEHAVE_HANDLE_LOCALLY = 0, + + BEHAVE_FORWARD_READ = BIT(0), + BEHAVE_FORWARD_WRITE = BIT(1), + BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + + /* Traps that take effect in Host EL0, this is rare! */ + BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2), +}; + +struct trap_bits { + const enum vcpu_sysreg index; + const enum trap_behaviour behaviour; + const u64 value; + const u64 mask; +}; + +/* Coarse Grained Trap definitions */ +enum cgt_group_id { + /* Indicates no coarse trap control */ + __RESERVED__, + + /* + * The first batch of IDs denote coarse trapping that are used + * on their own instead of being part of a combination of + * trap controls. + */ + CGT_HCR_TID1, + CGT_HCR_TID2, + CGT_HCR_TID3, + CGT_HCR_IMO, + CGT_HCR_FMO, + CGT_HCR_TIDCP, + CGT_HCR_TACR, + CGT_HCR_TSW, + CGT_HCR_TPC, + CGT_HCR_TPU, + CGT_HCR_TTLB, + CGT_HCR_TVM, + CGT_HCR_TDZ, + CGT_HCR_TRVM, + CGT_HCR_TLOR, + CGT_HCR_TERR, + CGT_HCR_APK, + CGT_HCR_NV, + CGT_HCR_NV_nNV2, + CGT_HCR_NV1_nNV2, + CGT_HCR_AT, + CGT_HCR_nFIEN, + CGT_HCR_TID4, + CGT_HCR_TICAB, + CGT_HCR_TOCU, + CGT_HCR_ENSCXT, + CGT_HCR_TTLBIS, + CGT_HCR_TTLBOS, + + CGT_MDCR_TPMCR, + CGT_MDCR_TPM, + CGT_MDCR_TDE, + CGT_MDCR_TDA, + CGT_MDCR_TDOSA, + CGT_MDCR_TDRA, + CGT_MDCR_E2PB, + CGT_MDCR_TPMS, + CGT_MDCR_TTRF, + CGT_MDCR_E2TB, + CGT_MDCR_TDCC, + + CGT_CPTR_TAM, + CGT_CPTR_TCPAC, + + CGT_HCRX_EnFPM, + CGT_HCRX_TCR2En, + CGT_HCRX_SCTLR2En, + + CGT_CNTHCTL_EL1TVT, + CGT_CNTHCTL_EL1TVCT, + + CGT_ICH_HCR_TC, + CGT_ICH_HCR_TALL0, + CGT_ICH_HCR_TALL1, + CGT_ICH_HCR_TDIR, + + /* + * Anything after this point is a combination of coarse trap + * controls, which must all be evaluated to decide what to do. + */ + __MULTIPLE_CONTROL_BITS__, + CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__, + CGT_HCR_TID2_TID4, + CGT_HCR_TTLB_TTLBIS, + CGT_HCR_TTLB_TTLBOS, + CGT_HCR_TVM_TRVM, + CGT_HCR_TVM_TRVM_HCRX_TCR2En, + CGT_HCR_TVM_TRVM_HCRX_SCTLR2En, + CGT_HCR_TPU_TICAB, + CGT_HCR_TPU_TOCU, + CGT_HCR_NV1_nNV2_ENSCXT, + CGT_MDCR_TPM_TPMCR, + CGT_MDCR_TPM_HPMN, + CGT_MDCR_TDE_TDA, + CGT_MDCR_TDE_TDOSA, + CGT_MDCR_TDE_TDRA, + CGT_MDCR_TDCC_TDE_TDA, + + CGT_ICH_HCR_TC_TDIR, + + /* + * Anything after this point requires a callback evaluating a + * complex trap condition. Ugly stuff. + */ + __COMPLEX_CONDITIONS__, + CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__, + CGT_CNTHCTL_EL1PTEN, + CGT_CNTHCTL_EL1NVPCT, + CGT_CNTHCTL_EL1NVVCT, + + CGT_CPTR_TTA, + CGT_MDCR_HPMN, + + /* Must be last */ + __NR_CGT_GROUP_IDS__ +}; + +static const struct trap_bits coarse_trap_bits[] = { + [CGT_HCR_TID1] = { + .index = HCR_EL2, + .value = HCR_TID1, + .mask = HCR_TID1, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_HCR_TID2] = { + .index = HCR_EL2, + .value = HCR_TID2, + .mask = HCR_TID2, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TID3] = { + .index = HCR_EL2, + .value = HCR_TID3, + .mask = HCR_TID3, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_HCR_IMO] = { + .index = HCR_EL2, + .value = HCR_IMO, + .mask = HCR_IMO, + .behaviour = BEHAVE_FORWARD_WRITE, + }, + [CGT_HCR_FMO] = { + .index = HCR_EL2, + .value = HCR_FMO, + .mask = HCR_FMO, + .behaviour = BEHAVE_FORWARD_WRITE, + }, + [CGT_HCR_TIDCP] = { + .index = HCR_EL2, + .value = HCR_TIDCP, + .mask = HCR_TIDCP, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TACR] = { + .index = HCR_EL2, + .value = HCR_TACR, + .mask = HCR_TACR, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TSW] = { + .index = HCR_EL2, + .value = HCR_TSW, + .mask = HCR_TSW, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */ + .index = HCR_EL2, + .value = HCR_TPC, + .mask = HCR_TPC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TPU] = { + .index = HCR_EL2, + .value = HCR_TPU, + .mask = HCR_TPU, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TTLB] = { + .index = HCR_EL2, + .value = HCR_TTLB, + .mask = HCR_TTLB, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TVM] = { + .index = HCR_EL2, + .value = HCR_TVM, + .mask = HCR_TVM, + .behaviour = BEHAVE_FORWARD_WRITE, + }, + [CGT_HCR_TDZ] = { + .index = HCR_EL2, + .value = HCR_TDZ, + .mask = HCR_TDZ, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TRVM] = { + .index = HCR_EL2, + .value = HCR_TRVM, + .mask = HCR_TRVM, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_HCR_TLOR] = { + .index = HCR_EL2, + .value = HCR_TLOR, + .mask = HCR_TLOR, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TERR] = { + .index = HCR_EL2, + .value = HCR_TERR, + .mask = HCR_TERR, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_APK] = { + .index = HCR_EL2, + .value = 0, + .mask = HCR_APK, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_NV] = { + .index = HCR_EL2, + .value = HCR_NV, + .mask = HCR_NV, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_NV_nNV2] = { + .index = HCR_EL2, + .value = HCR_NV, + .mask = HCR_NV | HCR_NV2, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_NV1_nNV2] = { + .index = HCR_EL2, + .value = HCR_NV | HCR_NV1, + .mask = HCR_NV | HCR_NV1 | HCR_NV2, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_AT] = { + .index = HCR_EL2, + .value = HCR_AT, + .mask = HCR_AT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_nFIEN] = { + .index = HCR_EL2, + .value = 0, + .mask = HCR_FIEN, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TID4] = { + .index = HCR_EL2, + .value = HCR_TID4, + .mask = HCR_TID4, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TICAB] = { + .index = HCR_EL2, + .value = HCR_TICAB, + .mask = HCR_TICAB, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TOCU] = { + .index = HCR_EL2, + .value = HCR_TOCU, + .mask = HCR_TOCU, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_ENSCXT] = { + .index = HCR_EL2, + .value = 0, + .mask = HCR_ENSCXT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TTLBIS] = { + .index = HCR_EL2, + .value = HCR_TTLBIS, + .mask = HCR_TTLBIS, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCR_TTLBOS] = { + .index = HCR_EL2, + .value = HCR_TTLBOS, + .mask = HCR_TTLBOS, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TPMCR] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TPMCR, + .mask = MDCR_EL2_TPMCR, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, + }, + [CGT_MDCR_TPM] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TPM, + .mask = MDCR_EL2_TPM, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, + }, + [CGT_MDCR_TDE] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDE, + .mask = MDCR_EL2_TDE, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TDA] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDA, + .mask = MDCR_EL2_TDA, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TDOSA] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDOSA, + .mask = MDCR_EL2_TDOSA, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TDRA] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDRA, + .mask = MDCR_EL2_TDRA, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_E2PB] = { + .index = MDCR_EL2, + .value = 0, + .mask = BIT(MDCR_EL2_E2PB_SHIFT), + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TPMS] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TPMS, + .mask = MDCR_EL2_TPMS, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TTRF] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TTRF, + .mask = MDCR_EL2_TTRF, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_E2TB] = { + .index = MDCR_EL2, + .value = 0, + .mask = BIT(MDCR_EL2_E2TB_SHIFT), + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_MDCR_TDCC] = { + .index = MDCR_EL2, + .value = MDCR_EL2_TDCC, + .mask = MDCR_EL2_TDCC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CPTR_TAM] = { + .index = CPTR_EL2, + .value = CPTR_EL2_TAM, + .mask = CPTR_EL2_TAM, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CPTR_TCPAC] = { + .index = CPTR_EL2, + .value = CPTR_EL2_TCPAC, + .mask = CPTR_EL2_TCPAC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCRX_EnFPM] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_EnFPM, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCRX_TCR2En] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_TCR2En, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCRX_SCTLR2En] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_SCTLR2En, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVT, + .mask = CNTHCTL_EL1TVT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVCT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVCT, + .mask = CNTHCTL_EL1TVCT, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_ICH_HCR_TC] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TC, + .mask = ICH_HCR_EL2_TC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TALL0] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TALL0, + .mask = ICH_HCR_EL2_TALL0, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TALL1] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TALL1, + .mask = ICH_HCR_EL2_TALL1, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TDIR] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_EL2_TDIR, + .mask = ICH_HCR_EL2_TDIR, + .behaviour = BEHAVE_FORWARD_RW, + }, +}; + +#define MCB(id, ...) \ + [id - __MULTIPLE_CONTROL_BITS__] = \ + (const enum cgt_group_id[]){ \ + __VA_ARGS__, __RESERVED__ \ + } + +static const enum cgt_group_id *coarse_control_combo[] = { + MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4), + MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS), + MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS), + MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM), + MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En, + CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En), + MCB(CGT_HCR_TVM_TRVM_HCRX_SCTLR2En, + CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_SCTLR2En), + MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB), + MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU), + MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT), + MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR), + MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN), + MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA), + MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), + MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), + MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA), + + MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC), + MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR), +}; + +typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); + +/* + * Warning, maximum confusion ahead. + * + * When E2H=0, CNTHCTL_EL2[1:0] are defined as EL1PCEN:EL1PCTEN + * When E2H=1, CNTHCTL_EL2[11:10] are defined as EL1PTEN:EL1PCTEN + * + * Note the single letter difference? Yet, the bits have the same + * function despite a different layout and a different name. + * + * We don't try to reconcile this mess. We just use the E2H=0 bits + * to generate something that is in the E2H=1 format, and live with + * it. You're welcome. + */ +static u64 get_sanitized_cnthctl(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + if (!vcpu_el2_e2h_is_set(vcpu)) + val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10; + + return val & ((CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN) << 10); +} + +static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu) +{ + if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) +{ + if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu) +{ + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV)); +} + +static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2); + + if (!vcpu_el2_e2h_is_set(vcpu)) + val = translate_cptr_el2_to_cpacr_el1(val); + + if (val & CPACR_EL1_TTA) + return BEHAVE_FORWARD_RW; + + return BEHAVE_HANDLE_LOCALLY; +} + +static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + unsigned int idx; + + + switch (sysreg) { + case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30): + case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30): + idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg); + break; + case SYS_PMXEVTYPER_EL0: + case SYS_PMXEVCNTR_EL0: + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); + break; + default: + /* Someone used this trap helper for something else... */ + KVM_BUG_ON(1, vcpu->kvm); + return BEHAVE_HANDLE_LOCALLY; + } + + if (kvm_pmu_counter_is_hyp(vcpu, idx)) + return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0; + + return BEHAVE_HANDLE_LOCALLY; +} + +#define CCC(id, fn) \ + [id - __COMPLEX_CONDITIONS__] = fn + +static const complex_condition_check ccc[] = { + CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), + CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), + CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct), + CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct), + CCC(CGT_CPTR_TTA, check_cptr_tta), + CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), +}; + +/* + * Bit assignment for the trap controls. We use a 64bit word with the + * following layout for each trapped sysreg: + * + * [9:0] enum cgt_group_id (10 bits) + * [13:10] enum fgt_group_id (4 bits) + * [19:14] bit number in the FGT register (6 bits) + * [20] trap polarity (1 bit) + * [25:21] FG filter (5 bits) + * [35:26] Main SysReg table index (10 bits) + * [62:36] Unused (27 bits) + * [63] RES0 - Must be zero, as lost on insertion in the xarray + */ +#define TC_CGT_BITS 10 +#define TC_FGT_BITS 4 +#define TC_FGF_BITS 5 +#define TC_SRI_BITS 10 + +union trap_config { + u64 val; + struct { + unsigned long cgt:TC_CGT_BITS; /* Coarse Grained Trap id */ + unsigned long fgt:TC_FGT_BITS; /* Fine Grained Trap id */ + unsigned long bit:6; /* Bit number */ + unsigned long pol:1; /* Polarity */ + unsigned long fgf:TC_FGF_BITS; /* Fine Grained Filter */ + unsigned long sri:TC_SRI_BITS; /* SysReg Index */ + unsigned long unused:27; /* Unused, should be zero */ + unsigned long mbz:1; /* Must Be Zero */ + }; +}; + +struct encoding_to_trap_config { + const u32 encoding; + const u32 end; + const union trap_config tc; + const unsigned int line; +}; + +/* + * WARNING: using ranges is a treacherous endeavour, as sysregs that + * are part of an architectural range are not necessarily contiguous + * in the [Op0,Op1,CRn,CRm,Ops] space. Tread carefully. + */ +#define SR_RANGE_TRAP(sr_start, sr_end, trap_id) \ + { \ + .encoding = sr_start, \ + .end = sr_end, \ + .tc = { \ + .cgt = trap_id, \ + }, \ + .line = __LINE__, \ + } + +#define SR_TRAP(sr, trap_id) SR_RANGE_TRAP(sr, sr, trap_id) + +/* + * Map encoding to trap bits for exception reported with EC=0x18. + * These must only be evaluated when running a nested hypervisor, but + * that the current context is not a hypervisor context. When the + * trapped access matches one of the trap controls, the exception is + * re-injected in the nested hypervisor. + */ +static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { + SR_TRAP(SYS_REVIDR_EL1, CGT_HCR_TID1), + SR_TRAP(SYS_AIDR_EL1, CGT_HCR_TID1), + SR_TRAP(SYS_SMIDR_EL1, CGT_HCR_TID1), + SR_TRAP(SYS_CTR_EL0, CGT_HCR_TID2), + SR_TRAP(SYS_CCSIDR_EL1, CGT_HCR_TID2_TID4), + SR_TRAP(SYS_CCSIDR2_EL1, CGT_HCR_TID2_TID4), + SR_TRAP(SYS_CLIDR_EL1, CGT_HCR_TID2_TID4), + SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), + SR_RANGE_TRAP(SYS_ID_PFR0_EL1, + sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), + SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0), + sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0), + sys_reg(3, 1, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 2, 11, 0, 0), + sys_reg(3, 2, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 3, 11, 0, 0), + sys_reg(3, 3, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 4, 11, 0, 0), + sys_reg(3, 4, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 5, 11, 0, 0), + sys_reg(3, 5, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 6, 11, 0, 0), + sys_reg(3, 6, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 7, 11, 0, 0), + sys_reg(3, 7, 11, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 0, 15, 0, 0), + sys_reg(3, 0, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 1, 15, 0, 0), + sys_reg(3, 1, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 2, 15, 0, 0), + sys_reg(3, 2, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 3, 15, 0, 0), + sys_reg(3, 3, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 4, 15, 0, 0), + sys_reg(3, 4, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 5, 15, 0, 0), + sys_reg(3, 5, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 6, 15, 0, 0), + sys_reg(3, 6, 15, 15, 7), CGT_HCR_TIDCP), + SR_RANGE_TRAP(sys_reg(3, 7, 15, 0, 0), + sys_reg(3, 7, 15, 15, 7), CGT_HCR_TIDCP), + SR_TRAP(SYS_ACTLR_EL1, CGT_HCR_TACR), + SR_TRAP(SYS_DC_ISW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CISW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_IGSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_IGDSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CGSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CGDSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CIGSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CIGDSW, CGT_HCR_TSW), + SR_TRAP(SYS_DC_CIVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CVAP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CVADP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_IVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CIGVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CIGDVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_IGVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_IGDVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGDVAC, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGVAP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGDVAP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGVADP, CGT_HCR_TPC), + SR_TRAP(SYS_DC_CGDVADP, CGT_HCR_TPC), + SR_TRAP(SYS_IC_IVAU, CGT_HCR_TPU_TOCU), + SR_TRAP(SYS_IC_IALLU, CGT_HCR_TPU_TOCU), + SR_TRAP(SYS_IC_IALLUIS, CGT_HCR_TPU_TICAB), + SR_TRAP(SYS_DC_CVAU, CGT_HCR_TPU_TOCU), + SR_TRAP(OP_TLBI_RVAE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAAE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVALE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAALE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VMALLE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_ASIDE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAAE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VALE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAALE1, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAAE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVALE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAALE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VMALLE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_ASIDE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAAE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VALE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_VAALE1NXS, CGT_HCR_TTLB), + SR_TRAP(OP_TLBI_RVAE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAAE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVALE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAALE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VMALLE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_ASIDE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAAE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VALE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAALE1IS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAAE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVALE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_RVAALE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VMALLE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_ASIDE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAAE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VALE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VAALE1ISNXS, CGT_HCR_TTLB_TTLBIS), + SR_TRAP(OP_TLBI_VMALLE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_ASIDE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAAE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VALE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAALE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAAE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVALE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAALE1OS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VMALLE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_ASIDE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAAE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VALE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_VAALE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAAE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVALE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(OP_TLBI_RVAALE1OSNXS, CGT_HCR_TTLB_TTLBOS), + SR_TRAP(SYS_SCTLR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_SCTLR2_EL1, CGT_HCR_TVM_TRVM_HCRX_SCTLR2En), + SR_TRAP(SYS_TTBR0_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_TTBR1_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_TCR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_ESR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_FAR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_AFSR0_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_AFSR1_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIRE0_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL0, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En), + SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ), + SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ), + SR_TRAP(SYS_DC_GZVA, CGT_HCR_TDZ), + SR_TRAP(SYS_LORSA_EL1, CGT_HCR_TLOR), + SR_TRAP(SYS_LOREA_EL1, CGT_HCR_TLOR), + SR_TRAP(SYS_LORN_EL1, CGT_HCR_TLOR), + SR_TRAP(SYS_LORC_EL1, CGT_HCR_TLOR), + SR_TRAP(SYS_LORID_EL1, CGT_HCR_TLOR), + SR_TRAP(SYS_ERRIDR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERRSELR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXADDR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXCTLR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXFR_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXMISC0_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXMISC1_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXMISC2_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXMISC3_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_ERXSTATUS_EL1, CGT_HCR_TERR), + SR_TRAP(SYS_APIAKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APIAKEYHI_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APIBKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APIBKEYHI_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APDAKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APDAKEYHI_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APDBKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APDBKEYHI_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APGAKEYLO_EL1, CGT_HCR_APK), + SR_TRAP(SYS_APGAKEYHI_EL1, CGT_HCR_APK), + /* All _EL2 registers */ + SR_TRAP(SYS_BRBCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VPIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VMPIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SCTLR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ACTLR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SCTLR2_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_HCR_EL2, + SYS_HCRX_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SMPRIMAP_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SMCR_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_TTBR0_EL2, + SYS_TCR2_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VTTBR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VTCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VNCR_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_HDFGRTR_EL2, + SYS_HAFGRTR_EL2, CGT_HCR_NV), + /* Skip the SP_EL1 encoding... */ + SR_TRAP(SYS_SPSR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ELR_EL2, CGT_HCR_NV), + /* Skip SPSR_irq, SPSR_abt, SPSR_und, SPSR_fiq */ + SR_TRAP(SYS_AFSR0_EL2, CGT_HCR_NV), + SR_TRAP(SYS_AFSR1_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ESR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VSESR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_TFSR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_FAR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_HPFAR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_PMSCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MAIR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_AMAIR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MPAMHCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MPAMVPMV_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MPAM2_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_MPAMVPM0_EL2, + SYS_MPAMVPM7_EL2, CGT_HCR_NV), + /* + * Note that the spec. describes a group of MEC registers + * whose access should not trap, therefore skip the following: + * MECID_A0_EL2, MECID_A1_EL2, MECID_P0_EL2, + * MECID_P1_EL2, MECIDR_EL2, VMECID_A_EL2, + * VMECID_P_EL2. + */ + SR_RANGE_TRAP(SYS_VBAR_EL2, + SYS_RMR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VDISR_EL2, CGT_HCR_NV), + /* ICH_AP0R<m>_EL2 */ + SR_RANGE_TRAP(SYS_ICH_AP0R0_EL2, + SYS_ICH_AP0R3_EL2, CGT_HCR_NV), + /* ICH_AP1R<m>_EL2 */ + SR_RANGE_TRAP(SYS_ICH_AP1R0_EL2, + SYS_ICH_AP1R3_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ICC_SRE_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_ICH_HCR_EL2, + SYS_ICH_EISR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ICH_ELRSR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ICH_VMCR_EL2, CGT_HCR_NV), + /* ICH_LR<m>_EL2 */ + SR_RANGE_TRAP(SYS_ICH_LR0_EL2, + SYS_ICH_LR15_EL2, CGT_HCR_NV), + SR_TRAP(SYS_CONTEXTIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_TPIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SCXTNUM_EL2, CGT_HCR_NV), + /* AMEVCNTVOFF0<n>_EL2, AMEVCNTVOFF1<n>_EL2 */ + SR_RANGE_TRAP(SYS_AMEVCNTVOFF0n_EL2(0), + SYS_AMEVCNTVOFF1n_EL2(15), CGT_HCR_NV), + /* CNT*_EL2 */ + SR_TRAP(SYS_CNTVOFF_EL2, CGT_HCR_NV), + SR_TRAP(SYS_CNTPOFF_EL2, CGT_HCR_NV), + SR_TRAP(SYS_CNTHCTL_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_CNTHP_TVAL_EL2, + SYS_CNTHP_CVAL_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2, + SYS_CNTHV_CVAL_EL2, CGT_HCR_NV), + /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/ + SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0), + sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV), + SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0), + sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV), + SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT), + SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT), + SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV), + SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV), + SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV), + SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV), + SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV), + SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV), + SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1NXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1IS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2ISNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1ISNXS,CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2OS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VAE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_ALLE1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VALE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_VMALLS12E1OSNXS,CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2E1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2E1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_IPAS2LE1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RIPAS2LE1OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVAE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_TLBI_RVALE2OSNXS, CGT_HCR_NV), + SR_TRAP(OP_CPP_RCTX, CGT_HCR_NV), + SR_TRAP(OP_DVP_RCTX, CGT_HCR_NV), + SR_TRAP(OP_CFP_RCTX, CGT_HCR_NV), + SR_TRAP(SYS_SP_EL1, CGT_HCR_NV_nNV2), + SR_TRAP(SYS_VBAR_EL1, CGT_HCR_NV1_nNV2), + SR_TRAP(SYS_ELR_EL1, CGT_HCR_NV1_nNV2), + SR_TRAP(SYS_SPSR_EL1, CGT_HCR_NV1_nNV2), + SR_TRAP(SYS_SCXTNUM_EL1, CGT_HCR_NV1_nNV2_ENSCXT), + SR_TRAP(SYS_SCXTNUM_EL0, CGT_HCR_ENSCXT), + SR_TRAP(OP_AT_S1E1R, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1W, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E0R, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT), + SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN), + SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN), + SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN), + SR_TRAP(SYS_PMCR_EL0, CGT_MDCR_TPM_TPMCR), + SR_TRAP(SYS_PMCNTENSET_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMCNTENCLR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMOVSSET_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM), + SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM), + SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM), + SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_OSDTRRX_EL1, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_OSDTRTX_EL1, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_DBGDTR_EL0, CGT_MDCR_TDCC_TDE_TDA), + /* + * Also covers DBGDTRRX_EL0, which has the same encoding as + * SYS_DBGDTRTX_EL0... + */ + SR_TRAP(SYS_DBGDTRTX_EL0, CGT_MDCR_TDCC_TDE_TDA), + SR_TRAP(SYS_MDSCR_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_OSECCR_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(0), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(1), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(2), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(3), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(4), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(5), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(6), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(7), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(8), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(9), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(10), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(11), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(12), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(13), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(14), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBVRn_EL1(15), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(0), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(1), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(2), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(3), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(4), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(5), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(6), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(7), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(8), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(9), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(10), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(11), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(12), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(13), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(14), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGBCRn_EL1(15), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(0), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(1), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(2), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(3), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(4), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(5), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(6), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(7), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(8), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(9), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(10), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(11), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(12), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(13), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(14), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWVRn_EL1(15), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(0), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(1), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(2), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(3), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(4), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(5), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(6), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(7), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(8), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(9), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(10), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(11), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(12), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(13), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(14), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGCLAIMSET_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGCLAIMCLR_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGAUTHSTATUS_EL1, CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_OSLAR_EL1, CGT_MDCR_TDE_TDOSA), + SR_TRAP(SYS_OSLSR_EL1, CGT_MDCR_TDE_TDOSA), + SR_TRAP(SYS_OSDLR_EL1, CGT_MDCR_TDE_TDOSA), + SR_TRAP(SYS_DBGPRCR_EL1, CGT_MDCR_TDE_TDOSA), + SR_TRAP(SYS_MDRAR_EL1, CGT_MDCR_TDE_TDRA), + SR_TRAP(SYS_PMBLIMITR_EL1, CGT_MDCR_E2PB), + SR_TRAP(SYS_PMBPTR_EL1, CGT_MDCR_E2PB), + SR_TRAP(SYS_PMBSR_EL1, CGT_MDCR_E2PB), + SR_TRAP(SYS_PMSCR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSEVFR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSFCR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSICR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSIDR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSIRR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSLATFR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSNEVFR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSDSFR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_TRFCR_EL1, CGT_MDCR_TTRF), + SR_TRAP(SYS_TRBBASER_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBLIMITR_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBMAR_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBPTR_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBSR_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_TRBTRG_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_CPACR_EL1, CGT_CPTR_TCPAC), + SR_TRAP(SYS_AMUSERENR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCFGR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCGCR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENCLR0_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENCLR1_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENSET0_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENSET1_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(4), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(5), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(6), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(7), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(8), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(9), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(10), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(11), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(12), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(13), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(14), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(15), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(4), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(5), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(6), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(7), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(8), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(9), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(10), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(11), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(12), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM), + /* op0=2, op1=1, and CRn<0b1000 */ + SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0), + sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA), + SR_TRAP(SYS_CNTP_TVAL_EL0, CGT_CNTHCTL_EL1PTEN), + SR_TRAP(SYS_CNTP_CVAL_EL0, CGT_CNTHCTL_EL1PTEN), + SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), + SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT), + SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT), + SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), + /* + * IMPDEF choice: + * We treat ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as + * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for + * ICC_SRE_EL1 access, and always handle it locally. + */ + SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR), + SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC), +}; + +static DEFINE_XARRAY(sr_forward_xa); + +enum fg_filter_id { + __NO_FGF__, + HCRX_FGTnXS, + + /* Must be last */ + __NR_FG_FILTER_IDS__ +}; + +#define __FGT(g, b, p, f) \ + { \ + .fgt = g ## _GROUP, \ + .bit = g ## _EL2_ ## b ## _SHIFT, \ + .pol = p, \ + .fgf = f, \ + } + +#define FGT(g, b, p) __FGT(g, b, p, __NO_FGF__) + +/* + * See the warning next to SR_RANGE_TRAP(), and apply the same + * level of caution. + */ +#define SR_FGF_RANGE(sr, e, g, b, p, f) \ + { \ + .encoding = sr, \ + .end = e, \ + .tc = __FGT(g, b, p, f), \ + .line = __LINE__, \ + } + +#define SR_FGF(sr, g, b, p, f) SR_FGF_RANGE(sr, sr, g, b, p, f) +#define SR_FGT(sr, g, b, p) SR_FGF_RANGE(sr, sr, g, b, p, __NO_FGF__) +#define SR_FGT_RANGE(sr, end, g, b, p) \ + SR_FGF_RANGE(sr, end, g, b, p, __NO_FGF__) + +static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { + /* HFGRTR_EL2, HFGWTR_EL2 */ + SR_FGT(SYS_AMAIR2_EL1, HFGRTR, nAMAIR2_EL1, 0), + SR_FGT(SYS_MAIR2_EL1, HFGRTR, nMAIR2_EL1, 0), + SR_FGT(SYS_S2POR_EL1, HFGRTR, nS2POR_EL1, 0), + SR_FGT(SYS_POR_EL1, HFGRTR, nPOR_EL1, 0), + SR_FGT(SYS_POR_EL0, HFGRTR, nPOR_EL0, 0), + SR_FGT(SYS_PIR_EL1, HFGRTR, nPIR_EL1, 0), + SR_FGT(SYS_PIRE0_EL1, HFGRTR, nPIRE0_EL1, 0), + SR_FGT(SYS_RCWMASK_EL1, HFGRTR, nRCWMASK_EL1, 0), + SR_FGT(SYS_TPIDR2_EL0, HFGRTR, nTPIDR2_EL0, 0), + SR_FGT(SYS_SMPRI_EL1, HFGRTR, nSMPRI_EL1, 0), + SR_FGT(SYS_GCSCR_EL1, HFGRTR, nGCS_EL1, 0), + SR_FGT(SYS_GCSPR_EL1, HFGRTR, nGCS_EL1, 0), + SR_FGT(SYS_GCSCRE0_EL1, HFGRTR, nGCS_EL0, 0), + SR_FGT(SYS_GCSPR_EL0, HFGRTR, nGCS_EL0, 0), + SR_FGT(SYS_ACCDATA_EL1, HFGRTR, nACCDATA_EL1, 0), + SR_FGT(SYS_ERXADDR_EL1, HFGRTR, ERXADDR_EL1, 1), + SR_FGT(SYS_ERXPFGCDN_EL1, HFGRTR, ERXPFGCDN_EL1, 1), + SR_FGT(SYS_ERXPFGCTL_EL1, HFGRTR, ERXPFGCTL_EL1, 1), + SR_FGT(SYS_ERXPFGF_EL1, HFGRTR, ERXPFGF_EL1, 1), + SR_FGT(SYS_ERXMISC0_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXMISC1_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXMISC2_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXMISC3_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXSTATUS_EL1, HFGRTR, ERXSTATUS_EL1, 1), + SR_FGT(SYS_ERXCTLR_EL1, HFGRTR, ERXCTLR_EL1, 1), + SR_FGT(SYS_ERXFR_EL1, HFGRTR, ERXFR_EL1, 1), + SR_FGT(SYS_ERRSELR_EL1, HFGRTR, ERRSELR_EL1, 1), + SR_FGT(SYS_ERRIDR_EL1, HFGRTR, ERRIDR_EL1, 1), + SR_FGT(SYS_ICC_IGRPEN0_EL1, HFGRTR, ICC_IGRPENn_EL1, 1), + SR_FGT(SYS_ICC_IGRPEN1_EL1, HFGRTR, ICC_IGRPENn_EL1, 1), + SR_FGT(SYS_VBAR_EL1, HFGRTR, VBAR_EL1, 1), + SR_FGT(SYS_TTBR1_EL1, HFGRTR, TTBR1_EL1, 1), + SR_FGT(SYS_TTBR0_EL1, HFGRTR, TTBR0_EL1, 1), + SR_FGT(SYS_TPIDR_EL0, HFGRTR, TPIDR_EL0, 1), + SR_FGT(SYS_TPIDRRO_EL0, HFGRTR, TPIDRRO_EL0, 1), + SR_FGT(SYS_TPIDR_EL1, HFGRTR, TPIDR_EL1, 1), + SR_FGT(SYS_TCR_EL1, HFGRTR, TCR_EL1, 1), + SR_FGT(SYS_TCR2_EL1, HFGRTR, TCR_EL1, 1), + SR_FGT(SYS_SCXTNUM_EL0, HFGRTR, SCXTNUM_EL0, 1), + SR_FGT(SYS_SCXTNUM_EL1, HFGRTR, SCXTNUM_EL1, 1), + SR_FGT(SYS_SCTLR_EL1, HFGRTR, SCTLR_EL1, 1), + SR_FGT(SYS_SCTLR2_EL1, HFGRTR, SCTLR_EL1, 1), + SR_FGT(SYS_REVIDR_EL1, HFGRTR, REVIDR_EL1, 1), + SR_FGT(SYS_PAR_EL1, HFGRTR, PAR_EL1, 1), + SR_FGT(SYS_MPIDR_EL1, HFGRTR, MPIDR_EL1, 1), + SR_FGT(SYS_MIDR_EL1, HFGRTR, MIDR_EL1, 1), + SR_FGT(SYS_MAIR_EL1, HFGRTR, MAIR_EL1, 1), + SR_FGT(SYS_LORSA_EL1, HFGRTR, LORSA_EL1, 1), + SR_FGT(SYS_LORN_EL1, HFGRTR, LORN_EL1, 1), + SR_FGT(SYS_LORID_EL1, HFGRTR, LORID_EL1, 1), + SR_FGT(SYS_LOREA_EL1, HFGRTR, LOREA_EL1, 1), + SR_FGT(SYS_LORC_EL1, HFGRTR, LORC_EL1, 1), + SR_FGT(SYS_ISR_EL1, HFGRTR, ISR_EL1, 1), + SR_FGT(SYS_FAR_EL1, HFGRTR, FAR_EL1, 1), + SR_FGT(SYS_ESR_EL1, HFGRTR, ESR_EL1, 1), + SR_FGT(SYS_DCZID_EL0, HFGRTR, DCZID_EL0, 1), + SR_FGT(SYS_CTR_EL0, HFGRTR, CTR_EL0, 1), + SR_FGT(SYS_CSSELR_EL1, HFGRTR, CSSELR_EL1, 1), + SR_FGT(SYS_CPACR_EL1, HFGRTR, CPACR_EL1, 1), + SR_FGT(SYS_CONTEXTIDR_EL1, HFGRTR, CONTEXTIDR_EL1, 1), + SR_FGT(SYS_CLIDR_EL1, HFGRTR, CLIDR_EL1, 1), + SR_FGT(SYS_CCSIDR_EL1, HFGRTR, CCSIDR_EL1, 1), + SR_FGT(SYS_APIBKEYLO_EL1, HFGRTR, APIBKey, 1), + SR_FGT(SYS_APIBKEYHI_EL1, HFGRTR, APIBKey, 1), + SR_FGT(SYS_APIAKEYLO_EL1, HFGRTR, APIAKey, 1), + SR_FGT(SYS_APIAKEYHI_EL1, HFGRTR, APIAKey, 1), + SR_FGT(SYS_APGAKEYLO_EL1, HFGRTR, APGAKey, 1), + SR_FGT(SYS_APGAKEYHI_EL1, HFGRTR, APGAKey, 1), + SR_FGT(SYS_APDBKEYLO_EL1, HFGRTR, APDBKey, 1), + SR_FGT(SYS_APDBKEYHI_EL1, HFGRTR, APDBKey, 1), + SR_FGT(SYS_APDAKEYLO_EL1, HFGRTR, APDAKey, 1), + SR_FGT(SYS_APDAKEYHI_EL1, HFGRTR, APDAKey, 1), + SR_FGT(SYS_AMAIR_EL1, HFGRTR, AMAIR_EL1, 1), + SR_FGT(SYS_AIDR_EL1, HFGRTR, AIDR_EL1, 1), + SR_FGT(SYS_AFSR1_EL1, HFGRTR, AFSR1_EL1, 1), + SR_FGT(SYS_AFSR0_EL1, HFGRTR, AFSR0_EL1, 1), + + /* HFGRTR2_EL2, HFGWTR2_EL2 */ + SR_FGT(SYS_ACTLRALIAS_EL1, HFGRTR2, nACTLRALIAS_EL1, 0), + SR_FGT(SYS_ACTLRMASK_EL1, HFGRTR2, nACTLRMASK_EL1, 0), + SR_FGT(SYS_CPACRALIAS_EL1, HFGRTR2, nCPACRALIAS_EL1, 0), + SR_FGT(SYS_CPACRMASK_EL1, HFGRTR2, nCPACRMASK_EL1, 0), + SR_FGT(SYS_PFAR_EL1, HFGRTR2, nPFAR_EL1, 0), + SR_FGT(SYS_RCWSMASK_EL1, HFGRTR2, nRCWSMASK_EL1, 0), + SR_FGT(SYS_SCTLR2ALIAS_EL1, HFGRTR2, nSCTLRALIAS2_EL1, 0), + SR_FGT(SYS_SCTLR2MASK_EL1, HFGRTR2, nSCTLR2MASK_EL1, 0), + SR_FGT(SYS_SCTLRALIAS_EL1, HFGRTR2, nSCTLRALIAS_EL1, 0), + SR_FGT(SYS_SCTLRMASK_EL1, HFGRTR2, nSCTLRMASK_EL1, 0), + SR_FGT(SYS_TCR2ALIAS_EL1, HFGRTR2, nTCR2ALIAS_EL1, 0), + SR_FGT(SYS_TCR2MASK_EL1, HFGRTR2, nTCR2MASK_EL1, 0), + SR_FGT(SYS_TCRALIAS_EL1, HFGRTR2, nTCRALIAS_EL1, 0), + SR_FGT(SYS_TCRMASK_EL1, HFGRTR2, nTCRMASK_EL1, 0), + SR_FGT(SYS_ERXGSR_EL1, HFGRTR2, nERXGSR_EL1, 0), + + /* HFGITR_EL2 */ + SR_FGT(OP_AT_S1E1A, HFGITR, ATS1E1A, 1), + SR_FGT(OP_COSP_RCTX, HFGITR, COSPRCTX, 1), + SR_FGT(OP_GCSPUSHX, HFGITR, nGCSEPP, 0), + SR_FGT(OP_GCSPOPX, HFGITR, nGCSEPP, 0), + SR_FGT(OP_GCSPUSHM, HFGITR, nGCSPUSHM_EL1, 0), + SR_FGT(OP_BRB_IALL, HFGITR, nBRBIALL, 0), + SR_FGT(OP_BRB_INJ, HFGITR, nBRBINJ, 0), + SR_FGT(SYS_DC_CVAC, HFGITR, DCCVAC, 1), + SR_FGT(SYS_DC_CGVAC, HFGITR, DCCVAC, 1), + SR_FGT(SYS_DC_CGDVAC, HFGITR, DCCVAC, 1), + SR_FGT(OP_CPP_RCTX, HFGITR, CPPRCTX, 1), + SR_FGT(OP_DVP_RCTX, HFGITR, DVPRCTX, 1), + SR_FGT(OP_CFP_RCTX, HFGITR, CFPRCTX, 1), + SR_FGT(OP_TLBI_VAALE1, HFGITR, TLBIVAALE1, 1), + SR_FGT(OP_TLBI_VALE1, HFGITR, TLBIVALE1, 1), + SR_FGT(OP_TLBI_VAAE1, HFGITR, TLBIVAAE1, 1), + SR_FGT(OP_TLBI_ASIDE1, HFGITR, TLBIASIDE1, 1), + SR_FGT(OP_TLBI_VAE1, HFGITR, TLBIVAE1, 1), + SR_FGT(OP_TLBI_VMALLE1, HFGITR, TLBIVMALLE1, 1), + SR_FGT(OP_TLBI_RVAALE1, HFGITR, TLBIRVAALE1, 1), + SR_FGT(OP_TLBI_RVALE1, HFGITR, TLBIRVALE1, 1), + SR_FGT(OP_TLBI_RVAAE1, HFGITR, TLBIRVAAE1, 1), + SR_FGT(OP_TLBI_RVAE1, HFGITR, TLBIRVAE1, 1), + SR_FGT(OP_TLBI_RVAALE1IS, HFGITR, TLBIRVAALE1IS, 1), + SR_FGT(OP_TLBI_RVALE1IS, HFGITR, TLBIRVALE1IS, 1), + SR_FGT(OP_TLBI_RVAAE1IS, HFGITR, TLBIRVAAE1IS, 1), + SR_FGT(OP_TLBI_RVAE1IS, HFGITR, TLBIRVAE1IS, 1), + SR_FGT(OP_TLBI_VAALE1IS, HFGITR, TLBIVAALE1IS, 1), + SR_FGT(OP_TLBI_VALE1IS, HFGITR, TLBIVALE1IS, 1), + SR_FGT(OP_TLBI_VAAE1IS, HFGITR, TLBIVAAE1IS, 1), + SR_FGT(OP_TLBI_ASIDE1IS, HFGITR, TLBIASIDE1IS, 1), + SR_FGT(OP_TLBI_VAE1IS, HFGITR, TLBIVAE1IS, 1), + SR_FGT(OP_TLBI_VMALLE1IS, HFGITR, TLBIVMALLE1IS, 1), + SR_FGT(OP_TLBI_RVAALE1OS, HFGITR, TLBIRVAALE1OS, 1), + SR_FGT(OP_TLBI_RVALE1OS, HFGITR, TLBIRVALE1OS, 1), + SR_FGT(OP_TLBI_RVAAE1OS, HFGITR, TLBIRVAAE1OS, 1), + SR_FGT(OP_TLBI_RVAE1OS, HFGITR, TLBIRVAE1OS, 1), + SR_FGT(OP_TLBI_VAALE1OS, HFGITR, TLBIVAALE1OS, 1), + SR_FGT(OP_TLBI_VALE1OS, HFGITR, TLBIVALE1OS, 1), + SR_FGT(OP_TLBI_VAAE1OS, HFGITR, TLBIVAAE1OS, 1), + SR_FGT(OP_TLBI_ASIDE1OS, HFGITR, TLBIASIDE1OS, 1), + SR_FGT(OP_TLBI_VAE1OS, HFGITR, TLBIVAE1OS, 1), + SR_FGT(OP_TLBI_VMALLE1OS, HFGITR, TLBIVMALLE1OS, 1), + /* nXS variants must be checked against HCRX_EL2.FGTnXS */ + SR_FGF(OP_TLBI_VAALE1NXS, HFGITR, TLBIVAALE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VALE1NXS, HFGITR, TLBIVALE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAAE1NXS, HFGITR, TLBIVAAE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_ASIDE1NXS, HFGITR, TLBIASIDE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAE1NXS, HFGITR, TLBIVAE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VMALLE1NXS, HFGITR, TLBIVMALLE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAALE1NXS, HFGITR, TLBIRVAALE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVALE1NXS, HFGITR, TLBIRVALE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAAE1NXS, HFGITR, TLBIRVAAE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAE1NXS, HFGITR, TLBIRVAE1, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAALE1ISNXS, HFGITR, TLBIRVAALE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVALE1ISNXS, HFGITR, TLBIRVALE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAAE1ISNXS, HFGITR, TLBIRVAAE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAE1ISNXS, HFGITR, TLBIRVAE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAALE1ISNXS, HFGITR, TLBIVAALE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VALE1ISNXS, HFGITR, TLBIVALE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAAE1ISNXS, HFGITR, TLBIVAAE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_ASIDE1ISNXS, HFGITR, TLBIASIDE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAE1ISNXS, HFGITR, TLBIVAE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VMALLE1ISNXS, HFGITR, TLBIVMALLE1IS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAALE1OSNXS, HFGITR, TLBIRVAALE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVALE1OSNXS, HFGITR, TLBIRVALE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAAE1OSNXS, HFGITR, TLBIRVAAE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_RVAE1OSNXS, HFGITR, TLBIRVAE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAALE1OSNXS, HFGITR, TLBIVAALE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VALE1OSNXS, HFGITR, TLBIVALE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAAE1OSNXS, HFGITR, TLBIVAAE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_ASIDE1OSNXS, HFGITR, TLBIASIDE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VAE1OSNXS, HFGITR, TLBIVAE1OS, 1, HCRX_FGTnXS), + SR_FGF(OP_TLBI_VMALLE1OSNXS, HFGITR, TLBIVMALLE1OS, 1, HCRX_FGTnXS), + SR_FGT(OP_AT_S1E1WP, HFGITR, ATS1E1WP, 1), + SR_FGT(OP_AT_S1E1RP, HFGITR, ATS1E1RP, 1), + SR_FGT(OP_AT_S1E0W, HFGITR, ATS1E0W, 1), + SR_FGT(OP_AT_S1E0R, HFGITR, ATS1E0R, 1), + SR_FGT(OP_AT_S1E1W, HFGITR, ATS1E1W, 1), + SR_FGT(OP_AT_S1E1R, HFGITR, ATS1E1R, 1), + SR_FGT(SYS_DC_ZVA, HFGITR, DCZVA, 1), + SR_FGT(SYS_DC_GVA, HFGITR, DCZVA, 1), + SR_FGT(SYS_DC_GZVA, HFGITR, DCZVA, 1), + SR_FGT(SYS_DC_CIVAC, HFGITR, DCCIVAC, 1), + SR_FGT(SYS_DC_CIGVAC, HFGITR, DCCIVAC, 1), + SR_FGT(SYS_DC_CIGDVAC, HFGITR, DCCIVAC, 1), + SR_FGT(SYS_DC_CVADP, HFGITR, DCCVADP, 1), + SR_FGT(SYS_DC_CGVADP, HFGITR, DCCVADP, 1), + SR_FGT(SYS_DC_CGDVADP, HFGITR, DCCVADP, 1), + SR_FGT(SYS_DC_CVAP, HFGITR, DCCVAP, 1), + SR_FGT(SYS_DC_CGVAP, HFGITR, DCCVAP, 1), + SR_FGT(SYS_DC_CGDVAP, HFGITR, DCCVAP, 1), + SR_FGT(SYS_DC_CVAU, HFGITR, DCCVAU, 1), + SR_FGT(SYS_DC_CISW, HFGITR, DCCISW, 1), + SR_FGT(SYS_DC_CIGSW, HFGITR, DCCISW, 1), + SR_FGT(SYS_DC_CIGDSW, HFGITR, DCCISW, 1), + SR_FGT(SYS_DC_CSW, HFGITR, DCCSW, 1), + SR_FGT(SYS_DC_CGSW, HFGITR, DCCSW, 1), + SR_FGT(SYS_DC_CGDSW, HFGITR, DCCSW, 1), + SR_FGT(SYS_DC_ISW, HFGITR, DCISW, 1), + SR_FGT(SYS_DC_IGSW, HFGITR, DCISW, 1), + SR_FGT(SYS_DC_IGDSW, HFGITR, DCISW, 1), + SR_FGT(SYS_DC_IVAC, HFGITR, DCIVAC, 1), + SR_FGT(SYS_DC_IGVAC, HFGITR, DCIVAC, 1), + SR_FGT(SYS_DC_IGDVAC, HFGITR, DCIVAC, 1), + SR_FGT(SYS_IC_IVAU, HFGITR, ICIVAU, 1), + SR_FGT(SYS_IC_IALLU, HFGITR, ICIALLU, 1), + SR_FGT(SYS_IC_IALLUIS, HFGITR, ICIALLUIS, 1), + + /* HFGITR2_EL2 */ + SR_FGT(SYS_DC_CIGDVAPS, HFGITR2, nDCCIVAPS, 0), + SR_FGT(SYS_DC_CIVAPS, HFGITR2, nDCCIVAPS, 0), + + /* HDFGRTR_EL2 */ + SR_FGT(SYS_PMBIDR_EL1, HDFGRTR, PMBIDR_EL1, 1), + SR_FGT(SYS_PMSNEVFR_EL1, HDFGRTR, nPMSNEVFR_EL1, 0), + SR_FGT(SYS_BRBINF_EL1(0), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(1), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(2), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(3), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(4), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(5), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(6), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(7), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(8), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(9), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(10), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(11), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(12), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(13), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(14), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(15), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(16), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(17), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(18), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(19), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(20), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(21), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(22), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(23), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(24), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(25), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(26), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(27), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(28), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(29), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(30), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINF_EL1(31), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBINFINJ_EL1, HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(0), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(1), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(2), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(3), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(4), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(5), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(6), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(7), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(8), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(9), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(10), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(11), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(12), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(13), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(14), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(15), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(16), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(17), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(18), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(19), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(20), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(21), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(22), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(23), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(24), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(25), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(26), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(27), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(28), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(29), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(30), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRC_EL1(31), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBSRCINJ_EL1, HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(0), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(1), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(2), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(3), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(4), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(5), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(6), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(7), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(8), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(9), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(10), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(11), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(12), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(13), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(14), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(15), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(16), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(17), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(18), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(19), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(20), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(21), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(22), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(23), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(24), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(25), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(26), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(27), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(28), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(29), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(30), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGT_EL1(31), HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTGTINJ_EL1, HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBTS_EL1, HDFGRTR, nBRBDATA, 0), + SR_FGT(SYS_BRBCR_EL1, HDFGRTR, nBRBCTL, 0), + SR_FGT(SYS_BRBFCR_EL1, HDFGRTR, nBRBCTL, 0), + SR_FGT(SYS_BRBIDR0_EL1, HDFGRTR, nBRBIDR, 0), + SR_FGT(SYS_PMCEID0_EL0, HDFGRTR, PMCEIDn_EL0, 1), + SR_FGT(SYS_PMCEID1_EL0, HDFGRTR, PMCEIDn_EL0, 1), + SR_FGT(SYS_PMUSERENR_EL0, HDFGRTR, PMUSERENR_EL0, 1), + SR_FGT(SYS_TRBTRG_EL1, HDFGRTR, TRBTRG_EL1, 1), + SR_FGT(SYS_TRBSR_EL1, HDFGRTR, TRBSR_EL1, 1), + SR_FGT(SYS_TRBPTR_EL1, HDFGRTR, TRBPTR_EL1, 1), + SR_FGT(SYS_TRBMAR_EL1, HDFGRTR, TRBMAR_EL1, 1), + SR_FGT(SYS_TRBLIMITR_EL1, HDFGRTR, TRBLIMITR_EL1, 1), + SR_FGT(SYS_TRBIDR_EL1, HDFGRTR, TRBIDR_EL1, 1), + SR_FGT(SYS_TRBBASER_EL1, HDFGRTR, TRBBASER_EL1, 1), + SR_FGT(SYS_TRCVICTLR, HDFGRTR, TRCVICTLR, 1), + SR_FGT(SYS_TRCSTATR, HDFGRTR, TRCSTATR, 1), + SR_FGT(SYS_TRCSSCSR(0), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(1), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(2), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(3), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(4), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(5), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(6), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSSCSR(7), HDFGRTR, TRCSSCSRn, 1), + SR_FGT(SYS_TRCSEQSTR, HDFGRTR, TRCSEQSTR, 1), + SR_FGT(SYS_TRCPRGCTLR, HDFGRTR, TRCPRGCTLR, 1), + SR_FGT(SYS_TRCOSLSR, HDFGRTR, TRCOSLSR, 1), + SR_FGT(SYS_TRCIMSPEC(0), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(1), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(2), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(3), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(4), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(5), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(6), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCIMSPEC(7), HDFGRTR, TRCIMSPECn, 1), + SR_FGT(SYS_TRCDEVARCH, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCDEVID, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR0, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR1, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR2, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR3, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR4, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR5, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR6, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR7, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR8, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR9, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR10, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR11, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR12, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCIDR13, HDFGRTR, TRCID, 1), + SR_FGT(SYS_TRCCNTVR(0), HDFGRTR, TRCCNTVRn, 1), + SR_FGT(SYS_TRCCNTVR(1), HDFGRTR, TRCCNTVRn, 1), + SR_FGT(SYS_TRCCNTVR(2), HDFGRTR, TRCCNTVRn, 1), + SR_FGT(SYS_TRCCNTVR(3), HDFGRTR, TRCCNTVRn, 1), + SR_FGT(SYS_TRCCLAIMCLR, HDFGRTR, TRCCLAIM, 1), + SR_FGT(SYS_TRCCLAIMSET, HDFGRTR, TRCCLAIM, 1), + SR_FGT(SYS_TRCAUXCTLR, HDFGRTR, TRCAUXCTLR, 1), + SR_FGT(SYS_TRCAUTHSTATUS, HDFGRTR, TRCAUTHSTATUS, 1), + SR_FGT(SYS_TRCACATR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(8), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(9), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(10), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(11), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(12), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(13), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(14), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACATR(15), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(8), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(9), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(10), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(11), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(12), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(13), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(14), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCACVR(15), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCBBCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCCCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCCTLR0, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCCTLR1, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCIDCVR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTCTLR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTCTLR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTCTLR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTCTLR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTRLDVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTRLDVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTRLDVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCNTRLDVR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCCONFIGR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEVENTCTL0R, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEVENTCTL1R, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEXTINSELR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEXTINSELR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEXTINSELR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCEXTINSELR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCQCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(8), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(9), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(10), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(11), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(12), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(13), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(14), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(15), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(16), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(17), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(18), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(19), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(20), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(21), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(22), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(23), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(24), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(25), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(26), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(27), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(28), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(29), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(30), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSCTLR(31), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCRSR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSEQEVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSEQEVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSEQEVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSEQRSTEVR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSCCR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSSPCICR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSTALLCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCSYNCPR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCTRACEIDR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCTSCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVIIECTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVIPCSSCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVISSCTLR, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCCTLR0, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCCTLR1, HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(0), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(1), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(2), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(3), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(4), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(5), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(6), HDFGRTR, TRC, 1), + SR_FGT(SYS_TRCVMIDCVR(7), HDFGRTR, TRC, 1), + SR_FGT(SYS_PMSLATFR_EL1, HDFGRTR, PMSLATFR_EL1, 1), + SR_FGT(SYS_PMSIRR_EL1, HDFGRTR, PMSIRR_EL1, 1), + SR_FGT(SYS_PMSIDR_EL1, HDFGRTR, PMSIDR_EL1, 1), + SR_FGT(SYS_PMSICR_EL1, HDFGRTR, PMSICR_EL1, 1), + SR_FGT(SYS_PMSFCR_EL1, HDFGRTR, PMSFCR_EL1, 1), + SR_FGT(SYS_PMSEVFR_EL1, HDFGRTR, PMSEVFR_EL1, 1), + SR_FGT(SYS_PMSCR_EL1, HDFGRTR, PMSCR_EL1, 1), + SR_FGT(SYS_PMBSR_EL1, HDFGRTR, PMBSR_EL1, 1), + SR_FGT(SYS_PMBPTR_EL1, HDFGRTR, PMBPTR_EL1, 1), + SR_FGT(SYS_PMBLIMITR_EL1, HDFGRTR, PMBLIMITR_EL1, 1), + SR_FGT(SYS_PMMIR_EL1, HDFGRTR, PMMIR_EL1, 1), + SR_FGT(SYS_PMSELR_EL0, HDFGRTR, PMSELR_EL0, 1), + SR_FGT(SYS_PMOVSCLR_EL0, HDFGRTR, PMOVS, 1), + SR_FGT(SYS_PMOVSSET_EL0, HDFGRTR, PMOVS, 1), + SR_FGT(SYS_PMINTENCLR_EL1, HDFGRTR, PMINTEN, 1), + SR_FGT(SYS_PMINTENSET_EL1, HDFGRTR, PMINTEN, 1), + SR_FGT(SYS_PMCNTENCLR_EL0, HDFGRTR, PMCNTEN, 1), + SR_FGT(SYS_PMCNTENSET_EL0, HDFGRTR, PMCNTEN, 1), + SR_FGT(SYS_PMCCNTR_EL0, HDFGRTR, PMCCNTR_EL0, 1), + SR_FGT(SYS_PMCCFILTR_EL0, HDFGRTR, PMCCFILTR_EL0, 1), + SR_FGT_RANGE(SYS_PMEVTYPERn_EL0(0), + SYS_PMEVTYPERn_EL0(30), + HDFGRTR, PMEVTYPERn_EL0, 1), + SR_FGT_RANGE(SYS_PMEVCNTRn_EL0(0), + SYS_PMEVCNTRn_EL0(30), + HDFGRTR, PMEVCNTRn_EL0, 1), + SR_FGT(SYS_OSDLR_EL1, HDFGRTR, OSDLR_EL1, 1), + SR_FGT(SYS_OSECCR_EL1, HDFGRTR, OSECCR_EL1, 1), + SR_FGT(SYS_OSLSR_EL1, HDFGRTR, OSLSR_EL1, 1), + SR_FGT(SYS_DBGPRCR_EL1, HDFGRTR, DBGPRCR_EL1, 1), + SR_FGT(SYS_DBGAUTHSTATUS_EL1, HDFGRTR, DBGAUTHSTATUS_EL1, 1), + SR_FGT(SYS_DBGCLAIMSET_EL1, HDFGRTR, DBGCLAIM, 1), + SR_FGT(SYS_DBGCLAIMCLR_EL1, HDFGRTR, DBGCLAIM, 1), + SR_FGT(SYS_MDSCR_EL1, HDFGRTR, MDSCR_EL1, 1), + /* + * The trap bits capture *64* debug registers per bit, but the + * ARM ARM only describes the encoding for the first 16, and + * we don't really support more than that anyway. + */ + SR_FGT(SYS_DBGWVRn_EL1(0), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(1), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(2), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(3), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(4), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(5), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(6), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(7), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(8), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(9), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(10), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(11), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(12), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(13), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(14), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWVRn_EL1(15), HDFGRTR, DBGWVRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(0), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(1), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(2), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(3), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(4), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(5), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(6), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(7), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(8), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(9), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(10), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(11), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(12), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(13), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(14), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGWCRn_EL1(15), HDFGRTR, DBGWCRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(0), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(1), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(2), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(3), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(4), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(5), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(6), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(7), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(8), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(9), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(10), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(11), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(12), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(13), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(14), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBVRn_EL1(15), HDFGRTR, DBGBVRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(0), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(1), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(2), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(3), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(4), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(5), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(6), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(7), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(8), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(9), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(10), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(11), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(12), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(13), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(14), HDFGRTR, DBGBCRn_EL1, 1), + SR_FGT(SYS_DBGBCRn_EL1(15), HDFGRTR, DBGBCRn_EL1, 1), + + /* HDFGRTR2_EL2 */ + SR_FGT(SYS_MDSELR_EL1, HDFGRTR2, nMDSELR_EL1, 0), + SR_FGT(SYS_MDSTEPOP_EL1, HDFGRTR2, nMDSTEPOP_EL1, 0), + SR_FGT(SYS_PMCCNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0), + SR_FGT_RANGE(SYS_PMEVCNTSVRn_EL1(0), + SYS_PMEVCNTSVRn_EL1(30), + HDFGRTR2, nPMSSDATA, 0), + SR_FGT(SYS_PMICNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0), + SR_FGT(SYS_PMECR_EL1, HDFGRTR2, nPMECR_EL1, 0), + SR_FGT(SYS_PMIAR_EL1, HDFGRTR2, nPMIAR_EL1, 0), + SR_FGT(SYS_PMICFILTR_EL0, HDFGRTR2, nPMICFILTR_EL0, 0), + SR_FGT(SYS_PMICNTR_EL0, HDFGRTR2, nPMICNTR_EL0, 0), + SR_FGT(SYS_PMSSCR_EL1, HDFGRTR2, nPMSSCR_EL1, 0), + SR_FGT(SYS_PMUACR_EL1, HDFGRTR2, nPMUACR_EL1, 0), + SR_FGT(SYS_SPMACCESSR_EL1, HDFGRTR2, nSPMACCESSR_EL1, 0), + SR_FGT(SYS_SPMCFGR_EL1, HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMDEVARCH_EL1, HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMCGCRn_EL1(0), HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMCGCRn_EL1(1), HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMIIDR_EL1, HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMCNTENCLR_EL0, HDFGRTR2, nSPMCNTEN, 0), + SR_FGT(SYS_SPMCNTENSET_EL0, HDFGRTR2, nSPMCNTEN, 0), + SR_FGT(SYS_SPMCR_EL0, HDFGRTR2, nSPMCR_EL0, 0), + SR_FGT(SYS_SPMDEVAFF_EL1, HDFGRTR2, nSPMDEVAFF_EL1, 0), + /* + * We have up to 64 of these registers in ranges of 16, banked via + * SPMSELR_EL0.BANK. We're only concerned with the accessors here, + * not the architectural registers. + */ + SR_FGT_RANGE(SYS_SPMEVCNTRn_EL0(0), + SYS_SPMEVCNTRn_EL0(15), + HDFGRTR2, nSPMEVCNTRn_EL0, 0), + SR_FGT_RANGE(SYS_SPMEVFILT2Rn_EL0(0), + SYS_SPMEVFILT2Rn_EL0(15), + HDFGRTR2, nSPMEVTYPERn_EL0, 0), + SR_FGT_RANGE(SYS_SPMEVFILTRn_EL0(0), + SYS_SPMEVFILTRn_EL0(15), + HDFGRTR2, nSPMEVTYPERn_EL0, 0), + SR_FGT_RANGE(SYS_SPMEVTYPERn_EL0(0), + SYS_SPMEVTYPERn_EL0(15), + HDFGRTR2, nSPMEVTYPERn_EL0, 0), + SR_FGT(SYS_SPMINTENCLR_EL1, HDFGRTR2, nSPMINTEN, 0), + SR_FGT(SYS_SPMINTENSET_EL1, HDFGRTR2, nSPMINTEN, 0), + SR_FGT(SYS_SPMOVSCLR_EL0, HDFGRTR2, nSPMOVS, 0), + SR_FGT(SYS_SPMOVSSET_EL0, HDFGRTR2, nSPMOVS, 0), + SR_FGT(SYS_SPMSCR_EL1, HDFGRTR2, nSPMSCR_EL1, 0), + SR_FGT(SYS_SPMSELR_EL0, HDFGRTR2, nSPMSELR_EL0, 0), + SR_FGT(SYS_TRCITECR_EL1, HDFGRTR2, nTRCITECR_EL1, 0), + SR_FGT(SYS_PMBMAR_EL1, HDFGRTR2, nPMBMAR_EL1, 0), + SR_FGT(SYS_PMSDSFR_EL1, HDFGRTR2, nPMSDSFR_EL1, 0), + SR_FGT(SYS_TRBMPAM_EL1, HDFGRTR2, nTRBMPAM_EL1, 0), + + /* + * HDFGWTR_EL2 + * + * Although HDFGRTR_EL2 and HDFGWTR_EL2 registers largely + * overlap in their bit assignment, there are a number of bits + * that are RES0 on one side, and an actual trap bit on the + * other. The policy chosen here is to describe all the + * read-side mappings, and only the write-side mappings that + * differ from the read side, and the trap handler will pick + * the correct shadow register based on the access type. + * + * Same model applies to the FEAT_FGT2 registers. + */ + SR_FGT(SYS_TRFCR_EL1, HDFGWTR, TRFCR_EL1, 1), + SR_FGT(SYS_TRCOSLAR, HDFGWTR, TRCOSLAR, 1), + SR_FGT(SYS_PMCR_EL0, HDFGWTR, PMCR_EL0, 1), + SR_FGT(SYS_PMSWINC_EL0, HDFGWTR, PMSWINC_EL0, 1), + SR_FGT(SYS_OSLAR_EL1, HDFGWTR, OSLAR_EL1, 1), + + /* HDFGWTR2_EL2 */ + SR_FGT(SYS_PMZR_EL0, HDFGWTR2, nPMZR_EL0, 0), + SR_FGT(SYS_SPMZR_EL0, HDFGWTR2, nSPMEVCNTRn_EL0, 0), + + /* + * HAFGRTR_EL2 + */ + SR_FGT(SYS_AMEVTYPER1_EL0(15), HAFGRTR, AMEVTYPER115_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(14), HAFGRTR, AMEVTYPER114_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(13), HAFGRTR, AMEVTYPER113_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(12), HAFGRTR, AMEVTYPER112_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(11), HAFGRTR, AMEVTYPER111_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(10), HAFGRTR, AMEVTYPER110_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(9), HAFGRTR, AMEVTYPER19_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(8), HAFGRTR, AMEVTYPER18_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(7), HAFGRTR, AMEVTYPER17_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(6), HAFGRTR, AMEVTYPER16_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(5), HAFGRTR, AMEVTYPER15_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(4), HAFGRTR, AMEVTYPER14_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(3), HAFGRTR, AMEVTYPER13_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(2), HAFGRTR, AMEVTYPER12_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(1), HAFGRTR, AMEVTYPER11_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(0), HAFGRTR, AMEVTYPER10_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(15), HAFGRTR, AMEVCNTR115_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(14), HAFGRTR, AMEVCNTR114_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(13), HAFGRTR, AMEVCNTR113_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(12), HAFGRTR, AMEVCNTR112_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(11), HAFGRTR, AMEVCNTR111_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(10), HAFGRTR, AMEVCNTR110_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(9), HAFGRTR, AMEVCNTR19_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(8), HAFGRTR, AMEVCNTR18_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(7), HAFGRTR, AMEVCNTR17_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(6), HAFGRTR, AMEVCNTR16_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(5), HAFGRTR, AMEVCNTR15_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(4), HAFGRTR, AMEVCNTR14_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(3), HAFGRTR, AMEVCNTR13_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(2), HAFGRTR, AMEVCNTR12_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(1), HAFGRTR, AMEVCNTR11_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(0), HAFGRTR, AMEVCNTR10_EL0, 1), + SR_FGT(SYS_AMCNTENCLR1_EL0, HAFGRTR, AMCNTEN1, 1), + SR_FGT(SYS_AMCNTENSET1_EL0, HAFGRTR, AMCNTEN1, 1), + SR_FGT(SYS_AMCNTENCLR0_EL0, HAFGRTR, AMCNTEN0, 1), + SR_FGT(SYS_AMCNTENSET0_EL0, HAFGRTR, AMCNTEN0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(3), HAFGRTR, AMEVCNTR03_EL0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(2), HAFGRTR, AMEVCNTR02_EL0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(1), HAFGRTR, AMEVCNTR01_EL0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, AMEVCNTR00_EL0, 1), +}; + +/* + * Additional FGTs that do not fire with ESR_EL2.EC==0x18. This table + * isn't used for exception routing, but only as a promise that the + * trap is handled somewhere else. + */ +static const union trap_config non_0x18_fgt[] __initconst = { + FGT(HFGITR, PSBCSYNC, 1), + FGT(HFGITR, nGCSSTR_EL1, 0), + FGT(HFGITR, SVC_EL1, 1), + FGT(HFGITR, SVC_EL0, 1), + FGT(HFGITR, ERET, 1), + FGT(HFGITR2, TSBCSYNC, 1), +}; + +static union trap_config get_trap_config(u32 sysreg) +{ + return (union trap_config) { + .val = xa_to_value(xa_load(&sr_forward_xa, sysreg)), + }; +} + +static __init void print_nv_trap_error(const struct encoding_to_trap_config *tc, + const char *type, int err) +{ + kvm_err("%s line %d encoding range " + "(%d, %d, %d, %d, %d) - (%d, %d, %d, %d, %d) (err=%d)\n", + type, tc->line, + sys_reg_Op0(tc->encoding), sys_reg_Op1(tc->encoding), + sys_reg_CRn(tc->encoding), sys_reg_CRm(tc->encoding), + sys_reg_Op2(tc->encoding), + sys_reg_Op0(tc->end), sys_reg_Op1(tc->end), + sys_reg_CRn(tc->end), sys_reg_CRm(tc->end), + sys_reg_Op2(tc->end), + err); +} + +static u32 encoding_next(u32 encoding) +{ + u8 op0, op1, crn, crm, op2; + + op0 = sys_reg_Op0(encoding); + op1 = sys_reg_Op1(encoding); + crn = sys_reg_CRn(encoding); + crm = sys_reg_CRm(encoding); + op2 = sys_reg_Op2(encoding); + + if (op2 < Op2_mask) + return sys_reg(op0, op1, crn, crm, op2 + 1); + if (crm < CRm_mask) + return sys_reg(op0, op1, crn, crm + 1, 0); + if (crn < CRn_mask) + return sys_reg(op0, op1, crn + 1, 0, 0); + if (op1 < Op1_mask) + return sys_reg(op0, op1 + 1, 0, 0, 0); + + return sys_reg(op0 + 1, 0, 0, 0, 0); +} + +#define FGT_MASKS(__n, __m) \ + struct fgt_masks __n = { .str = #__m, .res0 = __m, } + +FGT_MASKS(hfgrtr_masks, HFGRTR_EL2_RES0); +FGT_MASKS(hfgwtr_masks, HFGWTR_EL2_RES0); +FGT_MASKS(hfgitr_masks, HFGITR_EL2_RES0); +FGT_MASKS(hdfgrtr_masks, HDFGRTR_EL2_RES0); +FGT_MASKS(hdfgwtr_masks, HDFGWTR_EL2_RES0); +FGT_MASKS(hafgrtr_masks, HAFGRTR_EL2_RES0); +FGT_MASKS(hfgrtr2_masks, HFGRTR2_EL2_RES0); +FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2_RES0); +FGT_MASKS(hfgitr2_masks, HFGITR2_EL2_RES0); +FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2_RES0); +FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2_RES0); + +static __init bool aggregate_fgt(union trap_config tc) +{ + struct fgt_masks *rmasks, *wmasks; + + switch (tc.fgt) { + case HFGRTR_GROUP: + rmasks = &hfgrtr_masks; + wmasks = &hfgwtr_masks; + break; + case HDFGRTR_GROUP: + rmasks = &hdfgrtr_masks; + wmasks = &hdfgwtr_masks; + break; + case HAFGRTR_GROUP: + rmasks = &hafgrtr_masks; + wmasks = NULL; + break; + case HFGITR_GROUP: + rmasks = &hfgitr_masks; + wmasks = NULL; + break; + case HFGRTR2_GROUP: + rmasks = &hfgrtr2_masks; + wmasks = &hfgwtr2_masks; + break; + case HDFGRTR2_GROUP: + rmasks = &hdfgrtr2_masks; + wmasks = &hdfgwtr2_masks; + break; + case HFGITR2_GROUP: + rmasks = &hfgitr2_masks; + wmasks = NULL; + break; + } + + /* + * A bit can be reserved in either the R or W register, but + * not both. + */ + if ((BIT(tc.bit) & rmasks->res0) && + (!wmasks || (BIT(tc.bit) & wmasks->res0))) + return false; + + if (tc.pol) + rmasks->mask |= BIT(tc.bit) & ~rmasks->res0; + else + rmasks->nmask |= BIT(tc.bit) & ~rmasks->res0; + + if (wmasks) { + if (tc.pol) + wmasks->mask |= BIT(tc.bit) & ~wmasks->res0; + else + wmasks->nmask |= BIT(tc.bit) & ~wmasks->res0; + } + + return true; +} + +static __init int check_fgt_masks(struct fgt_masks *masks) +{ + unsigned long duplicate = masks->mask & masks->nmask; + u64 res0 = masks->res0; + int ret = 0; + + if (duplicate) { + int i; + + for_each_set_bit(i, &duplicate, 64) { + kvm_err("%s[%d] bit has both polarities\n", + masks->str, i); + } + + ret = -EINVAL; + } + + masks->res0 = ~(masks->mask | masks->nmask); + if (masks->res0 != res0) + kvm_info("Implicit %s = %016llx, expecting %016llx\n", + masks->str, masks->res0, res0); + + return ret; +} + +static __init int check_all_fgt_masks(int ret) +{ + static struct fgt_masks * const masks[] __initconst = { + &hfgrtr_masks, + &hfgwtr_masks, + &hfgitr_masks, + &hdfgrtr_masks, + &hdfgwtr_masks, + &hafgrtr_masks, + &hfgrtr2_masks, + &hfgwtr2_masks, + &hfgitr2_masks, + &hdfgrtr2_masks, + &hdfgwtr2_masks, + }; + int err = 0; + + for (int i = 0; i < ARRAY_SIZE(masks); i++) + err |= check_fgt_masks(masks[i]); + + return ret ?: err; +} + +#define for_each_encoding_in(__x, __s, __e) \ + for (u32 __x = __s; __x <= __e; __x = encoding_next(__x)) + +int __init populate_nv_trap_config(void) +{ + int ret = 0; + + BUILD_BUG_ON(sizeof(union trap_config) != sizeof(void *)); + BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS)); + BUILD_BUG_ON(__NR_FGT_GROUP_IDS__ > BIT(TC_FGT_BITS)); + BUILD_BUG_ON(__NR_FG_FILTER_IDS__ > BIT(TC_FGF_BITS)); + BUILD_BUG_ON(__HCRX_EL2_MASK & __HCRX_EL2_nMASK); + + for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) { + const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i]; + void *prev; + + if (cgt->tc.val & BIT(63)) { + kvm_err("CGT[%d] has MBZ bit set\n", i); + ret = -EINVAL; + } + + for_each_encoding_in(enc, cgt->encoding, cgt->end) { + prev = xa_store(&sr_forward_xa, enc, + xa_mk_value(cgt->tc.val), GFP_KERNEL); + if (prev && !xa_is_err(prev)) { + ret = -EINVAL; + print_nv_trap_error(cgt, "Duplicate CGT", ret); + } + + if (xa_is_err(prev)) { + ret = xa_err(prev); + print_nv_trap_error(cgt, "Failed CGT insertion", ret); + } + } + } + + if (__HCRX_EL2_RES0 != HCRX_EL2_RES0) + kvm_info("Sanitised HCR_EL2_RES0 = %016llx, expecting %016llx\n", + __HCRX_EL2_RES0, HCRX_EL2_RES0); + + kvm_info("nv: %ld coarse grained trap handlers\n", + ARRAY_SIZE(encoding_to_cgt)); + + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + goto check_mcb; + + for (int i = 0; i < ARRAY_SIZE(encoding_to_fgt); i++) { + const struct encoding_to_trap_config *fgt = &encoding_to_fgt[i]; + union trap_config tc; + void *prev; + + if (fgt->tc.fgt >= __NR_FGT_GROUP_IDS__) { + ret = -EINVAL; + print_nv_trap_error(fgt, "Invalid FGT", ret); + } + + for_each_encoding_in(enc, fgt->encoding, fgt->end) { + tc = get_trap_config(enc); + + if (tc.fgt) { + ret = -EINVAL; + print_nv_trap_error(fgt, "Duplicate FGT", ret); + } + + tc.val |= fgt->tc.val; + prev = xa_store(&sr_forward_xa, enc, + xa_mk_value(tc.val), GFP_KERNEL); + + if (xa_is_err(prev)) { + ret = xa_err(prev); + print_nv_trap_error(fgt, "Failed FGT insertion", ret); + } + + if (!aggregate_fgt(tc)) { + ret = -EINVAL; + print_nv_trap_error(fgt, "FGT bit is reserved", ret); + } + } + } + + for (int i = 0; i < ARRAY_SIZE(non_0x18_fgt); i++) { + if (!aggregate_fgt(non_0x18_fgt[i])) { + ret = -EINVAL; + kvm_err("non_0x18_fgt[%d] is reserved\n", i); + } + } + + ret = check_all_fgt_masks(ret); + + kvm_info("nv: %ld fine grained trap handlers\n", + ARRAY_SIZE(encoding_to_fgt)); + +check_mcb: + for (int id = __MULTIPLE_CONTROL_BITS__; id < __COMPLEX_CONDITIONS__; id++) { + const enum cgt_group_id *cgids; + + cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; + + for (int i = 0; cgids[i] != __RESERVED__; i++) { + if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ && + cgids[i] < __COMPLEX_CONDITIONS__) { + kvm_err("Recursive MCB %d/%d\n", id, cgids[i]); + ret = -EINVAL; + } + } + } + + if (ret) + xa_destroy(&sr_forward_xa); + + return ret; +} + +int __init populate_sysreg_config(const struct sys_reg_desc *sr, + unsigned int idx) +{ + union trap_config tc; + u32 encoding; + void *ret; + + /* + * 0 is a valid value for the index, but not for the storage. + * We'll store (idx+1), so check against an offset'd limit. + */ + if (idx >= (BIT(TC_SRI_BITS) - 1)) { + kvm_err("sysreg %s (%d) out of range\n", sr->name, idx); + return -EINVAL; + } + + encoding = sys_reg(sr->Op0, sr->Op1, sr->CRn, sr->CRm, sr->Op2); + tc = get_trap_config(encoding); + + if (tc.sri) { + kvm_err("sysreg %s (%d) duplicate entry (%d)\n", + sr->name, idx - 1, tc.sri); + return -EINVAL; + } + + tc.sri = idx + 1; + ret = xa_store(&sr_forward_xa, encoding, + xa_mk_value(tc.val), GFP_KERNEL); + + return xa_err(ret); +} + +static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu, + const struct trap_bits *tb) +{ + enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY; + u64 val; + + val = __vcpu_sys_reg(vcpu, tb->index); + if ((val & tb->mask) == tb->value) + b |= tb->behaviour; + + return b; +} + +static enum trap_behaviour __compute_trap_behaviour(struct kvm_vcpu *vcpu, + const enum cgt_group_id id, + enum trap_behaviour b) +{ + switch (id) { + const enum cgt_group_id *cgids; + + case __RESERVED__ ... __MULTIPLE_CONTROL_BITS__ - 1: + if (likely(id != __RESERVED__)) + b |= get_behaviour(vcpu, &coarse_trap_bits[id]); + break; + case __MULTIPLE_CONTROL_BITS__ ... __COMPLEX_CONDITIONS__ - 1: + /* Yes, this is recursive. Don't do anything stupid. */ + cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; + for (int i = 0; cgids[i] != __RESERVED__; i++) + b |= __compute_trap_behaviour(vcpu, cgids[i], b); + break; + default: + if (ARRAY_SIZE(ccc)) + b |= ccc[id - __COMPLEX_CONDITIONS__](vcpu); + break; + } + + return b; +} + +static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu, + const union trap_config tc) +{ + enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY; + + return __compute_trap_behaviour(vcpu, tc.cgt, b); +} + +static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) +{ + struct kvm_sysreg_masks *masks; + + /* Only handle the VNCR-backed regs for now */ + if (sr < __VNCR_START__) + return 0; + + masks = kvm->arch.sysreg_masks; + + return masks->mask[sr - __VNCR_START__].res0; +} + +static bool check_fgt_bit(struct kvm_vcpu *vcpu, enum vcpu_sysreg sr, + const union trap_config tc) +{ + struct kvm *kvm = vcpu->kvm; + u64 val; + + /* + * KVM doesn't know about any FGTs that apply to the host, and hopefully + * that'll remain the case. + */ + if (is_hyp_ctxt(vcpu)) + return false; + + val = __vcpu_sys_reg(vcpu, sr); + + if (tc.pol) + return (val & BIT(tc.bit)); + + /* + * FGTs with negative polarities are an absolute nightmare, as + * we need to evaluate the bit in the light of the feature + * that defines it. WTF were they thinking? + * + * So let's check if the bit has been earmarked as RES0, as + * this indicates an unimplemented feature. + */ + if (val & BIT(tc.bit)) + return false; + + return !(kvm_get_sysreg_res0(kvm, sr) & BIT(tc.bit)); +} + +bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) +{ + enum vcpu_sysreg fgtreg; + union trap_config tc; + enum trap_behaviour b; + bool is_read; + u32 sysreg; + u64 esr; + + esr = kvm_vcpu_get_esr(vcpu); + sysreg = esr_sys64_to_sysreg(esr); + is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + + tc = get_trap_config(sysreg); + + /* + * A value of 0 for the whole entry means that we know nothing + * for this sysreg, and that it cannot be re-injected into the + * nested hypervisor. In this situation, let's cut it short. + */ + if (!tc.val) + goto local; + + /* + * If a sysreg can be trapped using a FGT, first check whether we + * trap for the purpose of forbidding the feature. In that case, + * inject an UNDEF. + */ + if (tc.fgt != __NO_FGT_GROUP__ && + (vcpu->kvm->arch.fgu[tc.fgt] & BIT(tc.bit))) { + kvm_inject_undefined(vcpu); + return true; + } + + /* + * If we're not nesting, immediately return to the caller, with the + * sysreg index, should we have it. + */ + if (!vcpu_has_nv(vcpu)) + goto local; + + /* + * There are a few traps that take effect InHost, but are constrained + * to EL0. Don't bother with computing the trap behaviour if the vCPU + * isn't in EL0. + */ + if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu)) + goto local; + + switch ((enum fgt_group_id)tc.fgt) { + case __NO_FGT_GROUP__: + break; + + case HFGRTR_GROUP: + fgtreg = is_read ? HFGRTR_EL2 : HFGWTR_EL2; + break; + + case HDFGRTR_GROUP: + fgtreg = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2; + break; + + case HAFGRTR_GROUP: + fgtreg = HAFGRTR_EL2; + break; + + case HFGITR_GROUP: + fgtreg = HFGITR_EL2; + switch (tc.fgf) { + u64 tmp; + + case __NO_FGF__: + break; + + case HCRX_FGTnXS: + tmp = __vcpu_sys_reg(vcpu, HCRX_EL2); + if (tmp & HCRX_EL2_FGTnXS) + tc.fgt = __NO_FGT_GROUP__; + } + break; + + case HFGRTR2_GROUP: + fgtreg = is_read ? HFGRTR2_EL2 : HFGWTR2_EL2; + break; + + case HDFGRTR2_GROUP: + fgtreg = is_read ? HDFGRTR2_EL2 : HDFGWTR2_EL2; + break; + + case HFGITR2_GROUP: + fgtreg = HFGITR2_EL2; + break; + + default: + /* Something is really wrong, bail out */ + WARN_ONCE(1, "Bad FGT group (encoding %08x, config %016llx)\n", + sysreg, tc.val); + goto local; + } + + if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, fgtreg, tc)) + goto inject; + + b = compute_trap_behaviour(vcpu, tc); + + if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu)) + goto local; + + if (((b & BEHAVE_FORWARD_READ) && is_read) || + ((b & BEHAVE_FORWARD_WRITE) && !is_read)) + goto inject; + +local: + if (!tc.sri) { + struct sys_reg_params params; + + params = esr_sys64_to_params(esr); + + /* + * Check for the IMPDEF range, as per DDI0487 J.a, + * D18.3.2 Reserved encodings for IMPLEMENTATION + * DEFINED registers. + */ + if (!(params.Op0 == 3 && (params.CRn & 0b1011) == 0b1011)) + print_sys_reg_msg(¶ms, + "Unsupported guest access at: %lx\n", + *vcpu_pc(vcpu)); + kvm_inject_undefined(vcpu); + return true; + } + + *sr_index = tc.sri - 1; + return false; + +inject: + trace_kvm_forward_sysreg_trap(vcpu, sysreg, is_read); + + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return true; +} + +static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit) +{ + if (is_nested_ctxt(vcpu) && + (__vcpu_sys_reg(vcpu, reg) & control_bit)) { + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return true; + } + return false; +} + +static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, HCR_EL2, control_bit); +} + +bool forward_smc_trap(struct kvm_vcpu *vcpu) +{ + return forward_hcr_traps(vcpu, HCR_TSC); +} + +static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, MDCR_EL2, control_bit); +} + +bool forward_debug_exception(struct kvm_vcpu *vcpu) +{ + return forward_mdcr_traps(vcpu, MDCR_EL2_TDE); +} + +static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) +{ + u64 mode = spsr & PSR_MODE_MASK; + + /* + * Possible causes for an Illegal Exception Return from EL2: + * - trying to return to EL3 + * - trying to return to an illegal M value + * - trying to return to a 32bit EL + * - trying to return to EL1 with HCR_EL2.TGE set + */ + if (mode == PSR_MODE_EL3t || mode == PSR_MODE_EL3h || + mode == 0b00001 || (mode & BIT(1)) || + (spsr & PSR_MODE32_BIT) || + (vcpu_el2_tge_is_set(vcpu) && (mode == PSR_MODE_EL1t || + mode == PSR_MODE_EL1h))) { + /* + * The guest is playing with our nerves. Preserve EL, SP, + * masks, flags from the existing PSTATE, and set IL. + * The HW will then generate an Illegal State Exception + * immediately after ERET. + */ + spsr = *vcpu_cpsr(vcpu); + + spsr &= (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | + PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT | + PSR_MODE_MASK | PSR_MODE32_BIT); + spsr |= PSR_IL_BIT; + } + + return spsr; +} + +void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) +{ + u64 spsr, elr, esr; + + spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); + spsr = kvm_check_illegal_exception_return(vcpu, spsr); + + /* Check for an ERETAx */ + esr = kvm_vcpu_get_esr(vcpu); + if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) { + /* + * Oh no, ERETAx failed to authenticate. + * + * If we have FPACCOMBINE and we don't have a pending + * Illegal Execution State exception (which has priority + * over FPAC), deliver an exception right away. + * + * Otherwise, let the mangled ELR value trickle down the + * ERET handling, and the guest will have a little surprise. + */ + if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) { + esr &= ESR_ELx_ERET_ISS_ERETA; + esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC); + kvm_inject_nested_sync(vcpu, esr); + return; + } + } + + preempt_disable(); + vcpu_set_flag(vcpu, IN_NESTED_ERET); + kvm_arch_vcpu_put(vcpu); + + if (!esr_iss_is_eretax(esr)) + elr = __vcpu_sys_reg(vcpu, ELR_EL2); + + trace_kvm_nested_eret(vcpu, elr, spsr); + + *vcpu_pc(vcpu) = elr; + *vcpu_cpsr(vcpu) = spsr; + + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + vcpu_clear_flag(vcpu, IN_NESTED_ERET); + preempt_enable(); + + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_nested_transition(vcpu); +} + +static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, + enum exception_type type) +{ + trace_kvm_inject_nested_exception(vcpu, esr_el2, type); + + switch (type) { + case except_type_sync: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC); + vcpu_write_sys_reg(vcpu, esr_el2, ESR_EL2); + break; + case except_type_irq: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ); + break; + case except_type_serror: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR); + break; + default: + WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type); + } +} + +/* + * Emulate taking an exception to EL2. + * See ARM ARM J8.1.2 AArch64.TakeException() + */ +static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, + enum exception_type type) +{ + u64 pstate, mode; + bool direct_inject; + + if (!vcpu_has_nv(vcpu)) { + kvm_err("Unexpected call to %s for the non-nesting configuration\n", + __func__); + return -EINVAL; + } + + /* + * As for ERET, we can avoid doing too much on the injection path by + * checking that we either took the exception from a VHE host + * userspace or from vEL2. In these cases, there is no change in + * translation regime (or anything else), so let's do as little as + * possible. + */ + pstate = *vcpu_cpsr(vcpu); + mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT); + + direct_inject = (mode == PSR_MODE_EL0t && + vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)); + direct_inject |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t); + + if (direct_inject) { + kvm_inject_el2_exception(vcpu, esr_el2, type); + return 1; + } + + preempt_disable(); + + /* + * We may have an exception or PC update in the EL0/EL1 context. + * Commit it before entering EL2. + */ + __kvm_adjust_pc(vcpu); + + kvm_arch_vcpu_put(vcpu); + + kvm_inject_el2_exception(vcpu, esr_el2, type); + + /* + * A hard requirement is that a switch between EL1 and EL2 + * contexts has to happen between a put/load, so that we can + * pick the correct timer and interrupt configuration, among + * other things. + * + * Make sure the exception actually took place before we load + * the new context. + */ + __kvm_adjust_pc(vcpu); + + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); + + if (kvm_vcpu_has_pmu(vcpu)) + kvm_pmu_nested_transition(vcpu); + + return 1; +} + +int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2) +{ + return kvm_inject_nested(vcpu, esr_el2, except_type_sync); +} + +int kvm_inject_nested_irq(struct kvm_vcpu *vcpu) +{ + /* + * Do not inject an irq if the: + * - Current exception level is EL2, and + * - virtual HCR_EL2.TGE == 0 + * - virtual HCR_EL2.IMO == 0 + * + * See Table D1-17 "Physical interrupt target and masking when EL3 is + * not implemented and EL2 is implemented" in ARM DDI 0487C.a. + */ + + if (vcpu_is_el2(vcpu) && !vcpu_el2_tge_is_set(vcpu) && + !(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_IMO)) + return 1; + + /* esr_el2 value doesn't matter for exits due to irqs. */ + return kvm_inject_nested(vcpu, 0, except_type_irq); +} + +int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr) +{ + u64 esr = FIELD_PREP(ESR_ELx_EC_MASK, + iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW); + esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL; + + vcpu_write_sys_reg(vcpu, addr, FAR_EL2); + + if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE) + return kvm_inject_nested(vcpu, esr, except_type_serror); + + return kvm_inject_nested_sync(vcpu, esr); +} + +int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr) +{ + /* + * Hardware sets up the EC field when propagating ESR as a result of + * vSError injection. Manually populate EC for an emulated SError + * exception. + */ + esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR); + return kvm_inject_nested(vcpu, esr, except_type_serror); +} diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 02dd7e9ebd39..15e17aca1dec 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -14,57 +14,6 @@ #include <asm/kvm_mmu.h> #include <asm/sysreg.h> -void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu) -{ - struct task_struct *p = vcpu->arch.parent_task; - struct user_fpsimd_state *fpsimd; - - if (!is_protected_kvm_enabled() || !p) - return; - - fpsimd = &p->thread.uw.fpsimd_state; - kvm_unshare_hyp(fpsimd, fpsimd + 1); - put_task_struct(p); -} - -/* - * Called on entry to KVM_RUN unless this vcpu previously ran at least - * once and the most recent prior KVM_RUN for this vcpu was called from - * the same task as current (highly likely). - * - * This is guaranteed to execute before kvm_arch_vcpu_load_fp(vcpu), - * such that on entering hyp the relevant parts of current are already - * mapped. - */ -int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu) -{ - int ret; - - struct user_fpsimd_state *fpsimd = ¤t->thread.uw.fpsimd_state; - - kvm_vcpu_unshare_task_fp(vcpu); - - /* Make sure the host task fpsimd state is visible to hyp: */ - ret = kvm_share_hyp(fpsimd, fpsimd + 1); - if (ret) - return ret; - - vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd); - - /* - * We need to keep current's task_struct pinned until its data has been - * unshared with the hypervisor to make sure it is not re-used by the - * kernel and donated to someone else while already shared -- see - * kvm_vcpu_unshare_task_fp() for the matching put_task_struct(). - */ - if (is_protected_kvm_enabled()) { - get_task_struct(current); - vcpu->arch.parent_task = current; - } - - return 0; -} - /* * Prepare vcpu for saving the host's FPSIMD state and loading the guest's. * The actual loading is done by the FPSIMD access trap taken to hyp. @@ -79,37 +28,22 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) if (!system_supports_fpsimd()) return; - fpsimd_kvm_prepare(); - - vcpu->arch.fp_state = FP_STATE_HOST_OWNED; - - vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - vcpu_set_flag(vcpu, HOST_SVE_ENABLED); - /* - * We don't currently support SME guests but if we leave - * things in streaming mode then when the guest starts running - * FPSIMD or SVE code it may generate SME traps so as a - * special case if we are in streaming mode we force the host - * state to be saved now and exit streaming mode so that we - * don't have to handle any SME traps for valid guest - * operations. Do this for ZA as well for now for simplicity. + * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such + * that the host kernel is responsible for restoring this state upon + * return to userspace, and the hyp code doesn't need to save anything. + * + * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures + * that PSTATE.{SM,ZA} == {0,0}. */ - if (system_supports_sme()) { - vcpu_clear_flag(vcpu, HOST_SME_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) - vcpu_set_flag(vcpu, HOST_SME_ENABLED); + fpsimd_save_and_flush_cpu_state(); + *host_data_ptr(fp_owner) = FP_STATE_FREE; - if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { - vcpu->arch.fp_state = FP_STATE_FREE; - fpsimd_save_and_flush_cpu_state(); - } - } + WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR)); } /* - * Called just before entering the guest once we are no longer preemptable + * Called just before entering the guest once we are no longer preemptible * and interrupts are disabled. If we have managed to run anything using * FP while we were preemptible (such as off the back of an interrupt), * then neither the host nor the guest own the FP hardware (and it was the @@ -118,7 +52,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) { if (test_thread_flag(TIF_FOREIGN_FPSTATE)) - vcpu->arch.fp_state = FP_STATE_FREE; + *host_data_ptr(fp_owner) = FP_STATE_FREE; } /* @@ -134,8 +68,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) WARN_ON_ONCE(!irqs_disabled()); - if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { - + if (guest_owns_fp_regs()) { /* * Currently we do not support SME guests so SVCR is * always 0 and we just need a variable to point to. @@ -143,8 +76,9 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fp_state.st = &vcpu->arch.ctxt.fp_regs; fp_state.sve_state = vcpu->arch.sve_state; fp_state.sve_vl = vcpu->arch.sve_max_vl; - fp_state.za_state = NULL; - fp_state.svcr = &vcpu->arch.svcr; + fp_state.sme_state = NULL; + fp_state.svcr = __ctxt_sys_reg(&vcpu->arch.ctxt, SVCR); + fp_state.fpmr = __ctxt_sys_reg(&vcpu->arch.ctxt, FPMR); fp_state.fp_type = &vcpu->arch.fp_type; if (vcpu_has_sve(vcpu)) @@ -170,45 +104,18 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); - /* - * If we have VHE then the Hyp code will reset CPACR_EL1 to - * CPACR_EL1_DEFAULT and we need to reenable SME. - */ - if (has_vhe() && system_supports_sme()) { - /* Also restore EL0 state seen on entry */ - if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, - CPACR_EL1_SMEN_EL0EN | - CPACR_EL1_SMEN_EL1EN); - else - sysreg_clear_set(CPACR_EL1, - CPACR_EL1_SMEN_EL0EN, - CPACR_EL1_SMEN_EL1EN); - } - - if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { - if (vcpu_has_sve(vcpu)) { - __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); - - /* Restore the VL that was saved when bound to the CPU */ - if (!has_vhe()) - sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, - SYS_ZCR_EL1); - } - - fpsimd_save_and_flush_cpu_state(); - } else if (has_vhe() && system_supports_sve()) { + if (guest_owns_fp_regs()) { /* - * The FPSIMD/SVE state in the CPU has not been touched, and we - * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been - * reset to CPACR_EL1_DEFAULT by the Hyp code, disabling SVE - * for EL0. To avoid spurious traps, restore the trap state - * seen by kvm_arch_vcpu_load_fp(): + * Flush (save and invalidate) the fpsimd/sve state so that if + * the host tries to use fpsimd/sve, it's not using stale data + * from the guest. + * + * Flushing the state sets the TIF_FOREIGN_FPSTATE bit for the + * context unconditionally, in both nVHE and VHE. This allows + * the kernel to restore the fpsimd/sve state, including ZCR_EL1 + * when needed. */ - if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); - else - sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); + fpsimd_save_and_flush_cpu_state(); } local_irq_restore(flags); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index cf4c495a4321..1c87699fd886 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -24,6 +24,7 @@ #include <asm/fpsimd.h> #include <asm/kvm.h> #include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> #include <asm/sigcontext.h> #include "trace.h" @@ -250,9 +251,15 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case PSR_AA32_MODE_SVC: case PSR_AA32_MODE_ABT: case PSR_AA32_MODE_UND: + case PSR_AA32_MODE_SYS: if (!vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; + case PSR_MODE_EL2h: + case PSR_MODE_EL2t: + if (!vcpu_has_nv(vcpu)) + return -EINVAL; + fallthrough; case PSR_MODE_EL0t: case PSR_MODE_EL1t: case PSR_MODE_EL1h: @@ -270,7 +277,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { int i, nr_reg; - switch (*vcpu_cpsr(vcpu)) { + switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) { /* * Either we are dealing with user mode, and only the * first 15 registers (+ PC) must be narrowed to 32bit. @@ -584,59 +591,6 @@ static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) return copy_core_reg_indices(vcpu, NULL); } -/** - * ARM64 versions of the TIMER registers, always available on arm64 - */ - -#define NUM_TIMER_REGS 3 - -static bool is_timer_reg(u64 index) -{ - switch (index) { - case KVM_REG_ARM_TIMER_CTL: - case KVM_REG_ARM_TIMER_CNT: - case KVM_REG_ARM_TIMER_CVAL: - return true; - } - return false; -} - -static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - if (put_user(KVM_REG_ARM_TIMER_CTL, uindices)) - return -EFAULT; - uindices++; - if (put_user(KVM_REG_ARM_TIMER_CNT, uindices)) - return -EFAULT; - uindices++; - if (put_user(KVM_REG_ARM_TIMER_CVAL, uindices)) - return -EFAULT; - - return 0; -} - -static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - int ret; - - ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); - if (ret != 0) - return -EFAULT; - - return kvm_arm_timer_set_reg(vcpu, reg->id, val); -} - -static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - - val = kvm_arm_timer_get_reg(vcpu, reg->id); - return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0; -} - static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) { const unsigned int slices = vcpu_sve_slices(vcpu); @@ -700,6 +654,7 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, /** * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG + * @vcpu: the vCPU pointer * * This is for all registers. */ @@ -711,13 +666,14 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) res += num_sve_regs(vcpu); res += kvm_arm_num_sys_reg_descs(vcpu); res += kvm_arm_get_fw_num_regs(vcpu); - res += NUM_TIMER_REGS; return res; } /** * kvm_arm_copy_reg_indices - get indices of all registers. + * @vcpu: the vCPU pointer + * @uindices: register list to copy * * We do core registers right here, then we append system regs. */ @@ -740,11 +696,6 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return ret; uindices += kvm_arm_get_fw_num_regs(vcpu); - ret = copy_timer_indices(vcpu, uindices); - if (ret < 0) - return ret; - uindices += NUM_TIMER_REGS; - return kvm_arm_copy_sys_reg_indices(vcpu, uindices); } @@ -762,9 +713,6 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg); } - if (is_timer_reg(reg->id)) - return get_timer_reg(vcpu, reg); - return kvm_arm_sys_reg_get_reg(vcpu, reg); } @@ -782,9 +730,6 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg); } - if (is_timer_reg(reg->id)) - return set_timer_reg(vcpu, reg); - return kvm_arm_sys_reg_set_reg(vcpu, reg); } @@ -803,8 +748,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - events->exception.serror_pending = !!(vcpu->arch.hcr_el2 & HCR_VSE); - events->exception.serror_has_esr = cpus_have_const_cap(ARM64_HAS_RAS_EXTN); + events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); + events->exception.serror_pending = (vcpu->arch.hcr_el2 & HCR_VSE) || + vcpu_get_flag(vcpu, NESTED_SERROR_PENDING); if (events->exception.serror_pending && events->exception.serror_has_esr) events->exception.serror_esr = vcpu_get_vsesr(vcpu); @@ -818,29 +764,62 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, return 0; } +static void commit_pending_events(struct kvm_vcpu *vcpu) +{ + if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) + return; + + /* + * Reset the MMIO emulation state to avoid stepping PC after emulating + * the exception entry. + */ + vcpu->mmio_needed = false; + kvm_call_hyp(__kvm_adjust_pc, vcpu); +} + int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { bool serror_pending = events->exception.serror_pending; bool has_esr = events->exception.serror_has_esr; bool ext_dabt_pending = events->exception.ext_dabt_pending; + u64 esr = events->exception.serror_esr; + int ret = 0; - if (serror_pending && has_esr) { - if (!cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) - return -EINVAL; - - if (!((events->exception.serror_esr) & ~ESR_ELx_ISS_MASK)) - kvm_set_sei_esr(vcpu, events->exception.serror_esr); - else - return -EINVAL; - } else if (serror_pending) { - kvm_inject_vabt(vcpu); + /* + * Immediately commit the pending SEA to the vCPU's architectural + * state which is necessary since we do not return a pending SEA + * to userspace via KVM_GET_VCPU_EVENTS. + */ + if (ext_dabt_pending) { + ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + commit_pending_events(vcpu); } - if (ext_dabt_pending) - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + if (ret < 0) + return ret; + + if (!serror_pending) + return 0; - return 0; + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && has_esr) + return -EINVAL; + + if (has_esr && (esr & ~ESR_ELx_ISS_MASK)) + return -EINVAL; + + if (has_esr) + ret = kvm_inject_serror_esr(vcpu, esr); + else + ret = kvm_inject_serror(vcpu); + + /* + * We could've decided that the SError is due for immediate software + * injection; commit the exception in case userspace decides it wants + * to inject more exceptions for some strange reason. + */ + commit_pending_events(vcpu); + return (ret < 0) ? ret : 0; } u32 __attribute_const__ kvm_target_cpu(void) @@ -863,7 +842,7 @@ u32 __attribute_const__ kvm_target_cpu(void) break; case ARM_CPU_IMP_APM: switch (part_number) { - case APM_CPU_PART_POTENZA: + case APM_CPU_PART_XGENE: return KVM_ARM_TARGET_XGENE_POTENZA; } break; @@ -873,21 +852,6 @@ u32 __attribute_const__ kvm_target_cpu(void) return KVM_ARM_TARGET_GENERIC_V8; } -void kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) -{ - u32 target = kvm_target_cpu(); - - memset(init, 0, sizeof(*init)); - - /* - * For now, we don't return any features. - * In future, we might use features to return target - * specific features available for the preferred - * target type. - */ - init->target = (__u32)target; -} - int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; @@ -906,8 +870,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, /** * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging - * @kvm: pointer to the KVM struct - * @kvm_guest_debug: the ioctl data buffer + * @vcpu: the vCPU pointer + * @dbg: the ioctl data buffer * * This sets up and enables the VM for guest debugging. Userspace * passes in a control flag to enable different debug types and @@ -917,31 +881,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int ret = 0; - trace_kvm_set_guest_debug(vcpu, dbg->control); - if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) { - ret = -EINVAL; - goto out; - } - - if (dbg->control & KVM_GUESTDBG_ENABLE) { - vcpu->guest_debug = dbg->control; - - /* Hardware assisted Break and Watch points */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - vcpu->arch.external_debug_state = dbg->arch; - } + if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) + return -EINVAL; - } else { - /* If not enabled clear all flags */ + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->guest_debug = 0; - vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); + return 0; } -out: - return ret; + vcpu->guest_debug = dbg->control; + + /* Hardware assisted Break and Watch points */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + vcpu->arch.external_debug_state = dbg->arch; + + return 0; } int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, @@ -951,7 +908,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: + mutex_lock(&vcpu->kvm->arch.config_lock); ret = kvm_arm_pmu_v3_set_attr(vcpu, attr); + mutex_unlock(&vcpu->kvm->arch.config_lock); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_set_attr(vcpu, attr); @@ -1013,8 +972,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, return ret; } -long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, - struct kvm_arm_copy_mte_tags *copy_tags) +int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, + struct kvm_arm_copy_mte_tags *copy_tags) { gpa_t guest_ipa = copy_tags->guest_ipa; size_t length = copy_tags->length; @@ -1035,54 +994,72 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) return -EINVAL; + /* Lengths above INT_MAX cannot be represented in the return value */ + if (length > INT_MAX) + return -EINVAL; + gfn = gpa_to_gfn(guest_ipa); mutex_lock(&kvm->slots_lock); + if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) { + ret = -EBUSY; + goto out; + } + while (length > 0) { - kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); + struct page *page = __gfn_to_page(kvm, gfn, write); void *maddr; unsigned long num_tags; - struct page *page; + struct folio *folio; - if (is_error_noslot_pfn(pfn)) { + if (!page) { ret = -EFAULT; goto out; } - page = pfn_to_online_page(pfn); - if (!page) { + if (!pfn_to_online_page(page_to_pfn(page))) { /* Reject ZONE_DEVICE memory */ + kvm_release_page_unused(page); ret = -EFAULT; goto out; } + folio = page_folio(page); maddr = page_address(page); if (!write) { - if (page_mte_tagged(page)) + if ((folio_test_hugetlb(folio) && + folio_test_hugetlb_mte_tagged(folio)) || + page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else /* No tags in memory, so write zeros */ num_tags = MTE_GRANULES_PER_PAGE - clear_user(tags, MTE_GRANULES_PER_PAGE); - kvm_release_pfn_clean(pfn); + kvm_release_page_clean(page); } else { /* * Only locking to serialise with a concurrent - * set_pte_at() in the VMM but still overriding the + * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ - try_page_mte_tagging(page); + if (folio_test_hugetlb(folio)) + folio_try_hugetlb_mte_tagging(folio); + else + try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); /* uaccess failed, don't leave stale tags */ if (num_tags != MTE_GRANULES_PER_PAGE) mte_clear_page_tags(maddr); - set_page_mte_tagged(page); + if (folio_test_hugetlb(folio)) + folio_set_hugetlb_mte_tagged(folio); + else + set_page_mte_tagged(page); - kvm_release_pfn_dirty(pfn); + kvm_release_page_dirty(page); } if (num_tags != MTE_GRANULES_PER_PAGE) { diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index e778eefcf214..cc7d5d1709cb 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -10,12 +10,14 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/ubsan.h> #include <asm/esr.h> #include <asm/exception.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/debug-monitors.h> #include <asm/stacktrace/nvhe.h> #include <asm/traps.h> @@ -30,47 +32,82 @@ typedef int (*exit_handle_fn)(struct kvm_vcpu *); static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr) { if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr)) - kvm_inject_vabt(vcpu); + kvm_inject_serror(vcpu); } static int handle_hvc(struct kvm_vcpu *vcpu) { - int ret; - trace_kvm_hvc_arm64(*vcpu_pc(vcpu), vcpu_get_reg(vcpu, 0), kvm_vcpu_hvc_get_imm(vcpu)); vcpu->stat.hvc_exit_stat++; - ret = kvm_hvc_call_handler(vcpu); - if (ret < 0) { - vcpu_set_reg(vcpu, 0, ~0UL); + /* Forward hvc instructions to the virtual EL2 if the guest has EL2. */ + if (vcpu_has_nv(vcpu)) { + if (vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_HCD) + kvm_inject_undefined(vcpu); + else + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return 1; } - return ret; + return kvm_smccc_call_handler(vcpu); } static int handle_smc(struct kvm_vcpu *vcpu) { /* + * Forward this trapped smc instruction to the virtual EL2 if + * the guest has asked for it. + */ + if (forward_smc_trap(vcpu)) + return 1; + + /* * "If an SMC instruction executed at Non-secure EL1 is * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a * Trap exception, not a Secure Monitor Call exception [...]" * * We need to advance the PC after the trap, as it would - * otherwise return to the same address... + * otherwise return to the same address. Furthermore, pre-incrementing + * the PC before potentially exiting to userspace maintains the same + * abstraction for both SMCs and HVCs. */ - vcpu_set_reg(vcpu, 0, ~0UL); kvm_incr_pc(vcpu); - return 1; + + /* + * SMCs with a nonzero immediate are reserved according to DEN0028E 2.9 + * "SMC and HVC immediate value". + */ + if (kvm_vcpu_hvc_get_imm(vcpu)) { + vcpu_set_reg(vcpu, 0, ~0UL); + return 1; + } + + /* + * If imm is zero then it is likely an SMCCC call. + * + * Note that on ARMv8.3, even if EL3 is not implemented, SMC executed + * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than + * being treated as UNDEFINED. + */ + return kvm_smccc_call_handler(vcpu); } /* - * Guest access to FP/ASIMD registers are routed to this handler only - * when the system doesn't support FP/ASIMD. + * This handles the cases where the system does not support FP/ASIMD or when + * we are running nested virtualization and the guest hypervisor is trapping + * FP/ASIMD accesses by its guest guest. + * + * All other handling of guest vs. host FP/ASIMD register state is handled in + * fixup_guest_exit(). */ -static int handle_no_fpsimd(struct kvm_vcpu *vcpu) +static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu) { + if (guest_hyp_fpsimd_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + /* This is the case when the system doesn't support FP/ASIMD. */ kvm_inject_undefined(vcpu); return 1; } @@ -93,8 +130,12 @@ static int handle_no_fpsimd(struct kvm_vcpu *vcpu) static int kvm_handle_wfx(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); + bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE); - if (esr & ESR_ELx_WFx_ISS_WFE) { + if (guest_hyp_wfx_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + if (is_wfe) { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); vcpu->stat.wfe_exit_stat++; } else { @@ -106,7 +147,12 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) if (esr & ESR_ELx_WFx_ISS_RV) { u64 val, now; - now = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_TIMER_CNT); + now = kvm_phys_timer_read(); + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + now -= timer_get_offset(vcpu_hvtimer(vcpu)); + else + now -= timer_get_offset(vcpu_vtimer(vcpu)); + val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); if (now >= val) @@ -147,6 +193,9 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; u64 esr = kvm_vcpu_get_esr(vcpu); + if (!vcpu->guest_debug && forward_debug_exception(vcpu)) + return 1; + run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = lower_32_bits(esr); run->debug.arch.hsr_high = upper_32_bits(esr); @@ -157,7 +206,7 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) run->debug.arch.far = vcpu->arch.fault.far_el2; break; case ESR_ELx_EC_SOFTSTP_LOW: - vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; break; } @@ -181,21 +230,149 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) */ static int handle_sve(struct kvm_vcpu *vcpu) { + if (guest_hyp_sve_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + kvm_inject_undefined(vcpu); return 1; } /* - * Guest usage of a ptrauth instruction (which the guest EL1 did not turn into - * a NOP). If we get here, it is that we didn't fixup ptrauth on exit, and all - * that we can do is give the guest an UNDEF. + * Two possibilities to handle a trapping ptrauth instruction: + * + * - Guest usage of a ptrauth instruction (which the guest EL1 did not + * turn into a NOP). If we get here, it is because we didn't enable + * ptrauth for the guest. This results in an UNDEF, as it isn't + * supposed to use ptrauth without being told it could. + * + * - Running an L2 NV guest while L1 has left HCR_EL2.API==0, and for + * which we reinject the exception into L1. + * + * Anything else is an emulation bug (hence the WARN_ON + UNDEF). */ static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu) { + if (!vcpu_has_ptrauth(vcpu)) { + kvm_inject_undefined(vcpu); + return 1; + } + + if (is_nested_ctxt(vcpu)) { + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return 1; + } + + /* Really shouldn't be here! */ + WARN_ON_ONCE(1); + kvm_inject_undefined(vcpu); + return 1; +} + +static int kvm_handle_eret(struct kvm_vcpu *vcpu) +{ + if (esr_iss_is_eretax(kvm_vcpu_get_esr(vcpu)) && + !vcpu_has_ptrauth(vcpu)) + return kvm_handle_ptrauth(vcpu); + + /* + * If we got here, two possibilities: + * + * - the guest is in EL2, and we need to fully emulate ERET + * + * - the guest is in EL1, and we need to reinject the + * exception into the L1 hypervisor. + * + * If KVM ever traps ERET for its own use, we'll have to + * revisit this. + */ + if (is_hyp_ctxt(vcpu)) + kvm_emulate_nested_eret(vcpu); + else + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + return 1; +} + +static int handle_svc(struct kvm_vcpu *vcpu) +{ + /* + * So far, SVC traps only for NV via HFGITR_EL2. A SVC from a + * 32bit guest would be caught by vpcu_mode_is_bad_32bit(), so + * we should only have to deal with a 64 bit exception. + */ + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return 1; +} + +static int kvm_handle_gcs(struct kvm_vcpu *vcpu) +{ + /* We don't expect GCS, so treat it with contempt */ + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, GCS, IMP)) + WARN_ON_ONCE(1); + kvm_inject_undefined(vcpu); return 1; } +static int handle_other(struct kvm_vcpu *vcpu) +{ + bool allowed, fwd = is_nested_ctxt(vcpu); + u64 hcrx = __vcpu_sys_reg(vcpu, HCRX_EL2); + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 iss = ESR_ELx_ISS(esr); + struct kvm *kvm = vcpu->kvm; + + /* + * We only trap for two reasons: + * + * - the feature is disabled, and the only outcome is to + * generate an UNDEF. + * + * - the feature is enabled, but a NV guest wants to trap the + * feature used by its L2 guest. We forward the exception in + * this case. + * + * What we don't expect is to end-up here if the guest is + * expected be be able to directly use the feature, hence the + * WARN_ON below. + */ + switch (iss) { + case ESR_ELx_ISS_OTHER_ST64BV: + allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V); + fwd &= !(hcrx & HCRX_EL2_EnASR); + break; + case ESR_ELx_ISS_OTHER_ST64BV0: + allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA); + fwd &= !(hcrx & HCRX_EL2_EnAS0); + break; + case ESR_ELx_ISS_OTHER_LDST64B: + allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64); + fwd &= !(hcrx & HCRX_EL2_EnALS); + break; + case ESR_ELx_ISS_OTHER_TSBCSYNC: + allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1); + fwd &= (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC); + break; + case ESR_ELx_ISS_OTHER_PSBCSYNC: + allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P5); + fwd &= (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC); + break; + default: + /* Clearly, we're missing something. */ + WARN_ON_ONCE(1); + allowed = false; + } + + WARN_ON_ONCE(allowed && !fwd); + + if (allowed && fwd) + kvm_inject_nested_sync(vcpu, esr); + else + kvm_inject_undefined(vcpu); + + return 1; +} + static exit_handle_fn arm_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = kvm_handle_unknown_ec, [ESR_ELx_EC_WFx] = kvm_handle_wfx, @@ -205,21 +382,26 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_CP14_LS] = kvm_handle_cp14_load_store, [ESR_ELx_EC_CP10_ID] = kvm_handle_cp10_id, [ESR_ELx_EC_CP14_64] = kvm_handle_cp14_64, + [ESR_ELx_EC_OTHER] = handle_other, [ESR_ELx_EC_HVC32] = handle_hvc, [ESR_ELx_EC_SMC32] = handle_smc, [ESR_ELx_EC_HVC64] = handle_hvc, [ESR_ELx_EC_SMC64] = handle_smc, + [ESR_ELx_EC_SVC64] = handle_svc, [ESR_ELx_EC_SYS64] = kvm_handle_sys_reg, [ESR_ELx_EC_SVE] = handle_sve, + [ESR_ELx_EC_ERET] = kvm_handle_eret, [ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort, + [ESR_ELx_EC_DABT_CUR] = kvm_handle_vncr_abort, [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, - [ESR_ELx_EC_FP_ASIMD] = handle_no_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd, [ESR_ELx_EC_PAC] = kvm_handle_ptrauth, + [ESR_ELx_EC_GCS] = kvm_handle_gcs, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) @@ -313,7 +495,7 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) kvm_handle_guest_serror(vcpu, disr_to_esr(disr)); } else { - kvm_inject_vabt(vcpu); + kvm_inject_serror(vcpu); } return; @@ -325,6 +507,20 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } +static void print_nvhe_hyp_panic(const char *name, u64 panic_addr) +{ + kvm_err("nVHE hyp %s at: [<%016llx>] %pB!\n", name, panic_addr, + (void *)(panic_addr + kaslr_offset())); +} + +static void kvm_nvhe_report_cfi_failure(u64 panic_addr) +{ + print_nvhe_hyp_panic("CFI failure", panic_addr); + + if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) + kvm_err(" (CONFIG_CFI_PERMISSIVE ignored for hyp failures)\n"); +} + void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, @@ -337,7 +533,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) { kvm_err("Invalid host exception to nVHE hyp!\n"); } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && - (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) { + esr_brk_comment(esr) == BUG_BRK_IMM) { const char *file = NULL; unsigned int line = 0; @@ -353,16 +549,24 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, if (file) kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else - kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr, - (void *)(panic_addr + kaslr_offset())); + print_nvhe_hyp_panic("BUG", panic_addr); + } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) { + kvm_nvhe_report_cfi_failure(panic_addr); + } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) && + ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && + esr_is_ubsan_brk(esr)) { + print_nvhe_hyp_panic(report_ubsan_failure(esr & UBSAN_BRK_MASK), + panic_addr); } else { - kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr, - (void *)(panic_addr + kaslr_offset())); + print_nvhe_hyp_panic("panic", panic_addr); } /* Dump the nVHE hypervisor backtrace */ kvm_nvhe_dump_backtrace(hyp_offset); + /* Dump the faulting instruction */ + dump_kernel_instr(panic_addr + kaslr_offset()); + /* * Hyp has panicked and we're going to handle that by panicking the * kernel. The kernel offset will be revealed in the panic so we're diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index a38dea6186c9..d61e44642f98 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -3,7 +3,7 @@ # Makefile for Kernel-based Virtual Machine module, HYP part # -incdir := $(srctree)/$(src)/include +incdir := $(src)/include subdir-asflags-y := -I$(incdir) subdir-ccflags-y := -I$(incdir) diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c index f98cbe2626a1..449fa58cf3b6 100644 --- a/arch/arm64/kvm/hyp/aarch32.c +++ b/arch/arm64/kvm/hyp/aarch32.c @@ -50,9 +50,23 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) u32 cpsr_cond; int cond; - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_esr(vcpu) >> 30) + /* + * These are the exception classes that could fire with a + * conditional instruction. + */ + switch (kvm_vcpu_trap_get_class(vcpu)) { + case ESR_ELx_EC_CP15_32: + case ESR_ELx_EC_CP15_64: + case ESR_ELx_EC_CP14_MR: + case ESR_ELx_EC_CP14_LS: + case ESR_ELx_EC_FP_ASIMD: + case ESR_ELx_EC_CP10_ID: + case ESR_ELx_EC_CP14_64: + case ESR_ELx_EC_SVC32: + break; + default: return true; + } /* Is condition field valid? */ cond = kvm_vcpu_get_condition(vcpu); @@ -84,7 +98,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) } /** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block + * kvm_adjust_itstate - adjust ITSTATE when emulating instructions in IT-block * @vcpu: The VCPU pointer * * When exceptions occur while instructions are executed in Thumb IF-THEN @@ -120,7 +134,7 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) } /** - * kvm_skip_instr - skip a trapped instruction and proceed to the next + * kvm_skip_instr32 - skip a trapped instruction and proceed to the next * @vcpu: The vcpu pointer */ void kvm_skip_instr32(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 435346ea1504..9f4e8d68ab50 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN alternative_else_nop_endif mrs x1, isr_el1 cbz x1, 1f + + // Ensure that __guest_enter() always provides a context + // synchronization event so that callers don't need ISBs for anything + // that would usually be synchonized by the ERET. + isb mov x0, #ARM_EXCEPTION_IRQ ret @@ -83,6 +88,14 @@ alternative_else_nop_endif eret sb +SYM_INNER_LABEL(__guest_exit_restore_elr_and_panic, SYM_L_GLOBAL) + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack + + adr_this_cpu x0, kvm_hyp_ctxt, x1 + ldr x0, [x0, #CPU_ELR_EL2] + msr elr_el2, x0 + SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // x2-x29,lr: vcpu regs // vcpu x0-x1 on the stack @@ -171,7 +184,7 @@ alternative_else dsb sy // Synchronize against in-flight ld/st isb // Prevent an early read of side-effect free ISR mrs x2, isr_el1 - tbnz x2, #8, 2f // ISR_EL1.A + tbnz x2, #ISR_EL1_A_SHIFT, 2f ret nop 2: diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 791d3de76771..bef40ddb16db 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -14,6 +14,7 @@ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__) #error Hypervisor code only! @@ -21,33 +22,36 @@ static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { - u64 val; - - if (__vcpu_read_sys_reg_from_cpu(reg, &val)) - return val; + if (has_vhe()) + return vcpu_read_sys_reg(vcpu, reg); return __vcpu_sys_reg(vcpu, reg); } static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { - if (__vcpu_write_sys_reg_to_cpu(val, reg)) - return; - - __vcpu_sys_reg(vcpu, reg) = val; + if (has_vhe()) + vcpu_write_sys_reg(vcpu, val, reg); + else + __vcpu_assign_sys_reg(vcpu, reg, val); } -static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val) +static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, + u64 val) { - if (has_vhe()) - write_sysreg_el1(val, SYS_SPSR); - else - __vcpu_sys_reg(vcpu, SPSR_EL1) = val; + if (has_vhe()) { + if (target_mode == PSR_MODE_EL1h) + vcpu_write_sys_reg(vcpu, val, SPSR_EL1); + else + vcpu_write_sys_reg(vcpu, val, SPSR_EL2); + } else { + __vcpu_assign_sys_reg(vcpu, SPSR_EL1, val); + } } static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) { - if (has_vhe()) + if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) write_sysreg(val, spsr_abt); else vcpu->arch.ctxt.spsr_abt = val; @@ -55,7 +59,7 @@ static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) { - if (has_vhe()) + if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) write_sysreg(val, spsr_und); else vcpu->arch.ctxt.spsr_und = val; @@ -101,6 +105,11 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); break; + case PSR_MODE_EL2h: + vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL2); + sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL2); + __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL2); + break; default: /* Don't do that */ BUG(); @@ -153,7 +162,7 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, new |= target_mode; *vcpu_cpsr(vcpu) = new; - __vcpu_write_spsr(vcpu, old); + __vcpu_write_spsr(vcpu, target_mode, old); } /* @@ -323,11 +332,28 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR): + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_serror); + break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): + enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync); + break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL2_IRQ): + enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq); + break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR): + enter_exception64(vcpu, PSR_MODE_EL2h, except_type_serror); + break; + default: /* - * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ} - * will be implemented at some point. Everything - * else gets silently ignored. + * Only EL1_{SYNC,SERR} and EL2_{SYNC,IRQ,SERR} makes + * sense so far. Everything else gets silently + * ignored. */ break; } diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 61e6f3ba7b7d..e950875e31ce 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -25,3 +25,9 @@ SYM_FUNC_START(__sve_restore_state) sve_load 0, x1, x2, 3 ret SYM_FUNC_END(__sve_restore_state) + +SYM_FUNC_START(__sve_save_state) + mov x2, #1 + sve_save 0, x1, x2, 3 + ret +SYM_FUNC_END(__sve_save_state) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 8f3f93fa119e..03f97d71984c 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -154,6 +154,12 @@ SYM_CODE_END(\label) esb stp x0, x1, [sp, #-16]! 662: + /* + * spectre vectors __bp_harden_hyp_vecs generate br instructions at runtime + * that jump at offset 8 at __kvm_hyp_vector. + * As hyp .text is guarded section, it needs bti j. + */ + bti j b \target check_preamble_length 661b, 662b @@ -165,6 +171,8 @@ check_preamble_length 661b, 662b nop stp x0, x1, [sp, #-16]! 662: + /* Check valid_vect */ + bti j b \target check_preamble_length 661b, 662b diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 961bbef104a6..502a5b73ee70 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -88,15 +88,26 @@ default: write_debug(ptr[0], reg, 0); \ } +static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.debug_owner) { + case VCPU_DEBUG_FREE: + WARN_ON_ONCE(1); + fallthrough; + case VCPU_DEBUG_GUEST_OWNED: + return &vcpu->arch.vcpu_debug_state; + case VCPU_DEBUG_HOST_OWNED: + return &vcpu->arch.external_debug_state; + } + + return NULL; +} + static void __debug_save_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); save_debug(dbg->dbg_bcr, dbgbcr, brps); save_debug(dbg->dbg_bvr, dbgbvr, brps); @@ -109,13 +120,8 @@ static void __debug_save_state(struct kvm_guest_debug_arch *dbg, static void __debug_restore_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); restore_debug(dbg->dbg_bcr, dbgbcr, brps); restore_debug(dbg->dbg_bvr, dbgbvr, brps); @@ -132,13 +138,13 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; - host_dbg = &vcpu->arch.host_debug_state.regs; - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + host_dbg = host_data_ptr(host_debug_state.regs); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(host_dbg, host_ctxt); __debug_restore_state(guest_dbg, guest_ctxt); @@ -151,18 +157,16 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; - host_dbg = &vcpu->arch.host_debug_state.regs; - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + host_dbg = host_data_ptr(host_debug_state.regs); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - - vcpu_clear_flag(vcpu, DEBUG_DIRTY); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 9ddcfe2c3e57..fc573fc767b0 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -12,8 +12,19 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +static inline bool __fault_safe_to_translate(u64 esr) +{ + u64 fsc = esr & ESR_ELx_FSC; + + if (esr_fsc_is_sea_ttw(esr) || esr_fsc_is_secc_ttw(esr)) + return false; + + return !(fsc == ESR_ELx_FSC_EXTABT && (esr & ESR_ELx_FnV)); +} + static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { + int ret; u64 par, tmp; /* @@ -27,7 +38,9 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) * saved the guest context yet, and we may return early... */ par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) + ret = system_supports_poe() ? __kvm_at(OP_AT_S1E1A, far) : + __kvm_at(OP_AT_S1E1R, far); + if (!ret) tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ @@ -41,34 +54,50 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) return true; } -static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +/* + * Checks for the conditions when HPFAR_EL2 is written, per ARM ARM R_FKLWR. + */ +static inline bool __hpfar_valid(u64 esr) { - u64 hpfar, far; - - far = read_sysreg_el2(SYS_FAR); - /* - * The HPFAR can be invalid if the stage 2 fault did not - * happen during a stage 1 page table walk (the ESR_EL2.S1PTW - * bit is clear) and one of the two following cases are true: - * 1. The fault was due to a permission fault - * 2. The processor carries errata 834220 + * CPUs affected by ARM erratum #834220 may incorrectly report a + * stage-2 translation fault when a stage-1 permission fault occurs. * - * Therefore, for all non S1PTW faults where we either have a - * permission fault or the errata workaround is enabled, we - * resolve the IPA using the AT instruction. + * Re-walk the page tables to determine if a stage-1 fault actually + * occurred. */ - if (!(esr & ESR_ELx_S1PTW) && - (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM)) { - if (!__translate_far_to_hpfar(far, &hpfar)) - return false; - } else { + if (cpus_have_final_cap(ARM64_WORKAROUND_834220) && + esr_fsc_is_translation_fault(esr)) + return false; + + if (esr_fsc_is_translation_fault(esr) || esr_fsc_is_access_flag_fault(esr)) + return true; + + if ((esr & ESR_ELx_S1PTW) && esr_fsc_is_permission_fault(esr)) + return true; + + return esr_fsc_is_addr_sz_fault(esr); +} + +static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +{ + u64 hpfar; + + fault->far_el2 = read_sysreg_el2(SYS_FAR); + fault->hpfar_el2 = 0; + + if (__hpfar_valid(esr)) hpfar = read_sysreg(hpfar_el2); - } + else if (unlikely(!__fault_safe_to_translate(esr))) + return true; + else if (!__translate_far_to_hpfar(fault->far_el2, &hpfar)) + return false; - fault->far_el2 = far; - fault->hpfar_el2 = hpfar; + /* + * Hijack HPFAR_EL2.NS (RES0 in Non-secure) to indicate a valid + * HPFAR value. + */ + fault->hpfar_el2 = hpfar | HPFAR_EL2_NS; return true; } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 07d37ff88a3f..c5d5e5b86eaf 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -26,9 +26,11 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/fpsimd.h> #include <asm/debug-monitors.h> #include <asm/processor.h> +#include <asm/traps.h> struct kvm_exception_table_entry { int insn, fixup; @@ -37,19 +39,13 @@ struct kvm_exception_table_entry { extern struct kvm_exception_table_entry __start___kvm_ex_table; extern struct kvm_exception_table_entry __stop___kvm_ex_table; -/* Check whether the FP regs are owned by the guest */ -static inline bool guest_owns_fp_regs(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.fp_state == FP_STATE_GUEST_OWNED; -} - /* Save the 32-bit only FPSIMD system register state */ static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu) { if (!vcpu_el1_is_32bit(vcpu)) return; - __vcpu_sys_reg(vcpu, FPEXC32_EL2) = read_sysreg(fpexc32_el2); + __vcpu_assign_sys_reg(vcpu, FPEXC32_EL2, read_sysreg(fpexc32_el2)); } static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) @@ -69,8 +65,241 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } } +static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); +} + +static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + /* + * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to + * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, + * except for some missing controls, such as TAM. + * In this case, CPTR_EL2.TAM has the same position with or without + * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM + * shift value for trapping the AMU accesses. + */ + u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA; + u64 cptr; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } + + if (!vcpu_has_nv(vcpu)) + goto write; + + /* + * The architecture is a bit crap (what a surprise): an EL2 guest + * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, + * as they are RES0 in the guest's view. To work around it, trap the + * sucker using the very same bit it can't set... + */ + if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) + val |= CPTR_EL2_TCPAC; + + /* + * Layer the guest hypervisor's trap configuration on top of our own if + * we're in a nested context. + */ + if (is_hyp_ctxt(vcpu)) + goto write; + + cptr = vcpu_sanitised_cptr_el2(vcpu); + + /* + * Pay attention, there's some interesting detail here. + * + * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two + * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): + * + * - CPTR_EL2.xEN = x0, traps are enabled + * - CPTR_EL2.xEN = x1, traps are disabled + * + * In other words, bit[0] determines if guest accesses trap or not. In + * the interest of simplicity, clear the entire field if the guest + * hypervisor has traps enabled to dispel any illusion of something more + * complicated taking place. + */ + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; + + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + val |= cptr & CPACR_EL1_E0POE; + + val |= cptr & CPTR_EL2_TCPAC; + +write: + write_sysreg(val, cpacr_el1); +} + +static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + + if (has_vhe() || has_hvhe()) + __activate_cptr_traps_vhe(vcpu); + else + __activate_cptr_traps_nvhe(vcpu); +} + +static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1; + + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); +} + +static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN; + + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); +} + +static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_vhe() || has_hvhe()) + __deactivate_cptr_traps_vhe(vcpu); + else + __deactivate_cptr_traps_nvhe(vcpu); +} + +static inline bool cpu_has_amu(void) +{ + u64 pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); + + return cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_EL1_AMU_SHIFT); +} + +#define __activate_fgt(hctxt, vcpu, reg) \ + do { \ + ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ + write_sysreg_s(*vcpu_fgt(vcpu, reg), SYS_ ## reg); \ + } while (0) + +static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + return; + + __activate_fgt(hctxt, vcpu, HFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HFGWTR_EL2); + __activate_fgt(hctxt, vcpu, HFGITR_EL2); + __activate_fgt(hctxt, vcpu, HDFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HDFGWTR_EL2); + + if (cpu_has_amu()) + __activate_fgt(hctxt, vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __activate_fgt(hctxt, vcpu, HFGRTR2_EL2); + __activate_fgt(hctxt, vcpu, HFGWTR2_EL2); + __activate_fgt(hctxt, vcpu, HFGITR2_EL2); + __activate_fgt(hctxt, vcpu, HDFGRTR2_EL2); + __activate_fgt(hctxt, vcpu, HDFGWTR2_EL2); +} + +#define __deactivate_fgt(htcxt, vcpu, reg) \ + do { \ + write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ + SYS_ ## reg); \ + } while(0) + +static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + return; + + __deactivate_fgt(hctxt, vcpu, HFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR_EL2); + + if (cpu_has_amu()) + __deactivate_fgt(hctxt, vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __deactivate_fgt(hctxt, vcpu, HFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2); +} + +static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) +{ + u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + + if (!system_supports_mpam()) + return; + + /* trap guest access to MPAMIDR_EL1 */ + if (system_supports_mpam_hcr()) { + write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); + } else { + /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ + r |= MPAM2_EL2_TIDR; + } + + write_sysreg_s(r, SYS_MPAM2_EL2); +} + +static inline void __deactivate_traps_mpam(void) +{ + if (!system_supports_mpam()) + return; + + write_sysreg_s(0, SYS_MPAM2_EL2); + + if (system_supports_mpam_hcr()) + write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); +} + static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ write_sysreg(1 << 15, hstr_el2); @@ -80,68 +309,104 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * counter, which could make a PMXEVCNTR_EL0 access UNDEF at * EL1 instead of being trapped to EL2. */ - if (kvm_arm_support_pmu_v3()) { + if (system_supports_pmuv3()) { write_sysreg(0, pmselr_el0); + + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); } - vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - - if (cpus_have_final_cap(ARM64_SME)) { - sysreg_clear_set_s(SYS_HFGRTR_EL2, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK, - 0); - sysreg_clear_set_s(SYS_HFGWTR_EL2, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK, - 0); + if (cpus_have_final_cap(ARM64_HAS_HCX)) { + u64 hcrx = vcpu->arch.hcrx_el2; + if (is_nested_ctxt(vcpu)) { + u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2); + hcrx |= val & __HCRX_EL2_MASK; + hcrx &= ~(~val & __HCRX_EL2_nMASK); + } + + ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2); + write_sysreg_s(hcrx, SYS_HCRX_EL2); } + + __activate_traps_hfgxtr(vcpu); + __activate_traps_mpam(vcpu); } static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { - write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2); + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); write_sysreg(0, hstr_el2); - if (kvm_arm_support_pmu_v3()) - write_sysreg(0, pmuserenr_el0); - - if (cpus_have_final_cap(ARM64_SME)) { - sysreg_clear_set_s(SYS_HFGRTR_EL2, 0, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK); - sysreg_clear_set_s(SYS_HFGWTR_EL2, 0, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK); + if (system_supports_pmuv3()) { + write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); + vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); } + + if (cpus_have_final_cap(ARM64_HAS_HCX)) + write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2); + + __deactivate_traps_hfgxtr(vcpu); + __deactivate_traps_mpam(); } -static inline void ___activate_traps(struct kvm_vcpu *vcpu) +static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr) { - u64 hcr = vcpu->arch.hcr_el2; - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM)) hcr |= HCR_TVM; - write_sysreg(hcr, hcr_el2); + write_sysreg_hcr(hcr); - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) - write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2); + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) { + u64 vsesr; + + /* + * When HCR_EL2.AMO is set, physical SErrors are taken to EL2 + * and vSError injection is enabled for EL1. Conveniently, for + * NV this means that it is never the case where a 'physical' + * SError (injected by KVM or userspace) and vSError are + * deliverable to the same context. + * + * As such, we can trivially select between the host or guest's + * VSESR_EL2. Except for the case that FEAT_RAS hasn't been + * exposed to the guest, where ESR propagation in hardware + * occurs unconditionally. + * + * Paper over the architectural wart and use an IMPLEMENTATION + * DEFINED ESR value in case FEAT_RAS is hidden from the guest. + */ + if (!vserror_state_is_nested(vcpu)) + vsesr = vcpu->arch.vsesr_el2; + else if (kvm_has_ras(kern_hyp_va(vcpu->kvm))) + vsesr = __vcpu_sys_reg(vcpu, VSESR_EL2); + else + vsesr = ESR_ELx_ISV; + + write_sysreg_s(vsesr, SYS_VSESR_EL2); + } } static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) { + u64 *hcr; + + if (vserror_state_is_nested(vcpu)) + hcr = __ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2); + else + hcr = &vcpu->arch.hcr_el2; + /* * If we pended a virtual abort, preserve it until it gets * cleared. See D1.14.3 (Virtual Interrupts) for details, but * the crucial bit is "On taking a vSError interrupt, * HCR_EL2.VSE is cleared to 0." + * + * Additionally, when in a nested context we need to propagate the + * updated state to the guest hypervisor's HCR_EL2. */ - if (vcpu->arch.hcr_el2 & HCR_VSE) { - vcpu->arch.hcr_el2 &= ~HCR_VSE; - vcpu->arch.hcr_el2 |= read_sysreg(hcr_el2) & HCR_VSE; + if (*hcr & HCR_VSE) { + *hcr &= ~HCR_VSE; + *hcr |= read_sysreg(hcr_el2) & HCR_VSE; } } @@ -150,25 +415,142 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } +static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + + /* + * Finish potential single step before executing the prologue + * instruction. + */ + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); + + return true; +} + static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) { + /* + * The vCPU's saved SVE state layout always matches the max VL of the + * vCPU. Start off with the max VL so we can load the SVE state. + */ sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); __sve_restore_state(vcpu_sve_pffr(vcpu), - &vcpu->arch.ctxt.fp_regs.fpsr); - write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); + &vcpu->arch.ctxt.fp_regs.fpsr, + true); + + /* + * The effective VL for a VM could differ from the max VL when running a + * nested guest, as the guest hypervisor could select a smaller VL. Slap + * that into hardware before wrapping up. + */ + if (is_nested_ctxt(vcpu)) + sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2); + + write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR); } +static inline void __hyp_sve_save_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR); + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); + __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); +} + +static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + if (vcpu_has_sve(vcpu)) { + /* A guest hypervisor may restrict the effective max VL. */ + if (is_nested_ctxt(vcpu)) + zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); + else + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); + write_sysreg_el1(zcr_el1, SYS_ZCR); + } +} + +static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + /* + * When the guest owns the FP regs, we know that guest+hyp traps for + * any FPSIMD/SVE/SME features exposed to the guest have been disabled + * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() + * prior to __guest_entry(). As __guest_entry() guarantees a context + * synchronization event, we don't need an ISB here to avoid taking + * traps for anything that was exposed to the guest. + */ + if (vcpu_has_sve(vcpu)) { + zcr_el1 = read_sysreg_el1(SYS_ZCR); + __vcpu_assign_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu), zcr_el1); + + /* + * The guest's state is always saved using the guest's max VL. + * Ensure that the host has the guest's max VL active such that + * the host can save the guest's state lazily, but don't + * artificially restrict the host to the guest's max VL. + */ + if (has_vhe()) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + } else { + zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el1(zcr_el1, SYS_ZCR); + } + } +} + +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (system_supports_sve()) { + __hyp_sve_save_host(); + } else { + __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); + } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); +} + + /* * We trap the first access to the FP/SIMD to save the host context and * restore the guest context lazily. * If FP/SIMD is not implemented, handle the trap and inject an undefined * instruction exception to the guest. Similarly for trapped SVE accesses. */ -static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest; u8 esr_ec; - u64 reg; if (!system_supports_fpsimd()) return false; @@ -176,31 +558,36 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) sve_guest = vcpu_has_sve(vcpu); esr_ec = kvm_vcpu_trap_get_class(vcpu); - /* Don't handle SVE traps for non-SVE vcpus here: */ - if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD) + /* Only handle traps the vCPU can support here: */ + switch (esr_ec) { + case ESR_ELx_EC_FP_ASIMD: + /* Forward traps to the guest hypervisor as required */ + if (guest_hyp_fpsimd_traps_enabled(vcpu)) + return false; + break; + case ESR_ELx_EC_SYS64: + if (WARN_ON_ONCE(!is_hyp_ctxt(vcpu))) + return false; + fallthrough; + case ESR_ELx_EC_SVE: + if (!sve_guest) + return false; + if (guest_hyp_sve_traps_enabled(vcpu)) + return false; + break; + default: return false; + } /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (has_vhe()) { - reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN; - if (sve_guest) - reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; - - sysreg_clear_set(cpacr_el1, 0, reg); - } else { - reg = CPTR_EL2_TFP; - if (sve_guest) - reg |= CPTR_EL2_TZ; - - sysreg_clear_set(cptr_el2, reg, 0); - } + __deactivate_cptr_traps(vcpu); isb(); /* Write out the host state if it's in the registers */ - if (vcpu->arch.fp_state == FP_STATE_HOST_OWNED) - __fpsimd_save_state(vcpu->arch.host_fpsimd_state); + if (is_protected_kvm_enabled() && host_owns_fp_regs()) + kvm_hyp_save_fpsimd_host(vcpu); /* Restore the guest state */ if (sve_guest) @@ -208,11 +595,21 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) else __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs); + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR); + /* Skip restoring fpexc32 for AArch64 guests */ if (!(read_sysreg(hcr_el2) & HCR_RW)) write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2); - vcpu->arch.fp_state = FP_STATE_GUEST_OWNED; + *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED; + + /* + * Re-enable traps necessary for the current state of the guest, e.g. + * those enabled by a guest hypervisor. The ERET to the guest will + * provide the necessary context synchronization. + */ + __activate_cptr_traps(vcpu); return true; } @@ -272,77 +669,126 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } -static inline bool esr_is_ptrauth_trap(u64 esr) +/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */ +static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt) { - switch (esr_sys64_to_sysreg(esr)) { - case SYS_APIAKEYLO_EL1: - case SYS_APIAKEYHI_EL1: - case SYS_APIBKEYLO_EL1: - case SYS_APIBKEYHI_EL1: - case SYS_APDAKEYLO_EL1: - case SYS_APDAKEYHI_EL1: - case SYS_APDBKEYLO_EL1: - case SYS_APDBKEYHI_EL1: - case SYS_APGAKEYLO_EL1: - case SYS_APGAKEYHI_EL1: - return true; - } + u64 offset = 0; - return false; -} + if (ctxt->offset.vm_offset) + offset += *kern_hyp_va(ctxt->offset.vm_offset); + if (ctxt->offset.vcpu_offset) + offset += *kern_hyp_va(ctxt->offset.vcpu_offset); -#define __ptrauth_save_key(ctxt, key) \ - do { \ - u64 __val; \ - __val = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \ - ctxt_sys_reg(ctxt, key ## KEYLO_EL1) = __val; \ - __val = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \ - ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \ -} while(0) + return offset; +} -DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); +static inline u64 compute_counter_value(struct arch_timer_context *ctxt) +{ + return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt); +} -static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code) +static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) { - struct kvm_cpu_context *ctxt; + struct arch_timer_context *ctxt; + u32 sysreg; u64 val; - if (!vcpu_has_ptrauth(vcpu)) + /* + * We only get here for 64bit guests, 32bit guests will hit + * the long and winding road all the way to the standard + * handling. Yes, it sucks to be irrelevant. + * + * Also, we only deal with non-hypervisor context here (either + * an EL1 guest, or a non-HYP context of an EL2 guest). + */ + if (is_hyp_ctxt(vcpu)) + return false; + + sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + + switch (sysreg) { + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + if (!vcpu_el2_e2h_is_set(vcpu)) + val = (val & CNTHCTL_EL1PCTEN) << 10; + + if (!(val & (CNTHCTL_EL1PCTEN << 10))) + return false; + } + + ctxt = vcpu_ptimer(vcpu); + break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + if (val & CNTHCTL_EL1TVCT) + return false; + } + + ctxt = vcpu_vtimer(vcpu); + break; + default: return false; + } - ctxt = this_cpu_ptr(&kvm_hyp_ctxt); - __ptrauth_save_key(ctxt, APIA); - __ptrauth_save_key(ctxt, APIB); - __ptrauth_save_key(ctxt, APDA); - __ptrauth_save_key(ctxt, APDB); - __ptrauth_save_key(ctxt, APGA); + val = compute_counter_value(ctxt); - vcpu_ptrauth_enable(vcpu); + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + return true; +} + +static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + int rt = kvm_vcpu_sys_get_rt(vcpu); + u64 val = vcpu_get_reg(vcpu, rt); - val = read_sysreg(hcr_el2); - val |= (HCR_API | HCR_APK); - write_sysreg(val, hcr_el2); + if (sysreg != SYS_TCR_EL1) + return false; + /* + * Affected parts do not advertise support for hardware Access Flag / + * Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying + * control bits are still functional. The architecture requires these be + * RES0 on systems that do not implement FEAT_HAFDBS. + * + * Uphold the requirements of the architecture by masking guest writes + * to TCR_EL1.{HA,HD} here. + */ + val &= ~(TCR_HD | TCR_HA); + write_sysreg_el1(val, SYS_TCR); + __kvm_skip_instr(vcpu); return true; } -static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && handle_tx2_tvm(vcpu)) return true; + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) && + handle_ampere1_tcr(vcpu)) + return true; + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && __vgic_v3_perform_cpuif_access(vcpu) == 1) return true; - if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) - return kvm_hyp_handle_ptrauth(vcpu, exit_code); + if (kvm_handle_cntxct(vcpu)) + return true; return false; } -static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) { if (static_branch_unlikely(&vgic_v3_cpuif_trap) && __vgic_v3_perform_cpuif_access(vcpu) == 1) @@ -351,23 +797,26 @@ static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } -static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, + u64 *exit_code) { if (!__populate_fault_info(vcpu)) return true; return false; } +#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault +#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault -static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) { - if (!__populate_fault_info(vcpu)) + if (kvm_hyp_handle_memory_fault(vcpu, exit_code)) return true; if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { bool valid; - valid = kvm_vcpu_trap_get_fault_type(vcpu) == ESR_ELx_FSC_FAULT && + valid = kvm_vcpu_trap_is_translation_fault(vcpu) && kvm_vcpu_dabt_isvalid(vcpu) && !kvm_vcpu_abt_issea(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); @@ -389,23 +838,16 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); - -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); - /* * Allow the hypervisor to handle the exit with an exit handler if it has one. * * Returns true if the hypervisor handled the exit, and control should go back * to the guest, or false if it hasn't. */ -static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); - exit_handler_fn fn; - - fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; - + exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; if (fn) return fn(vcpu, exit_code); @@ -435,20 +877,9 @@ static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code * the guest, false when we should restore the host state and return to the * main run loop. */ -static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - /* - * Save PSTATE early so that we can evaluate the vcpu mode - * early on. - */ - synchronize_vcpu_pstate(vcpu, exit_code); - - /* - * Check whether we want to repaint the state one way or - * another. - */ - early_exit_filter(vcpu, exit_code); - if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); @@ -478,7 +909,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) goto exit; /* Check if there's an exit handler and allow it to handle the exit. */ - if (kvm_hyp_handle_exit(vcpu, exit_code)) + if (kvm_hyp_handle_exit(vcpu, exit_code, handlers)) goto guest; exit: /* Return to the host kernel and handle the exit */ @@ -492,7 +923,7 @@ guest: static inline void __kvm_unexpected_el2_exception(void) { - extern char __guest_exit_panic[]; + extern char __guest_exit_restore_elr_and_panic[]; unsigned long addr, fixup; struct kvm_exception_table_entry *entry, *end; unsigned long elr_el2 = read_sysreg(elr_el2); @@ -514,7 +945,8 @@ static inline void __kvm_unexpected_el2_exception(void) } /* Trigger a panic after restoring the hyp context. */ - write_sysreg(__guest_exit_panic, elr_el2); + this_cpu_ptr(&kvm_hyp_ctxt)->sys_regs[ELR_EL2] = elr_el2; + write_sysreg(__guest_exit_restore_elr_and_panic, elr_el2); } #endif /* __ARM64_KVM_HYP_SWITCH_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index baa5b9b3dde5..a17cbe7582de 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -16,9 +16,51 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); + +static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; + + if (!vcpu) + vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); + + return vcpu; +} + +static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt) +{ + return host_data_ptr(host_ctxt) != ctxt; +} + +static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); + + if (ctxt_is_guest(ctxt) && kvm_host_owns_debug_regs(vcpu)) + return &vcpu->arch.external_mdscr_el1; + + return &ctxt_sys_reg(ctxt, MDSCR_EL1); +} + +static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm); + + if (!(ctxt_is_guest(ctxt) && + test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))) + return read_cpuid_id(); + + return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1); +} + static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { - ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); + *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0); } static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) @@ -29,22 +71,84 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) { - struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; - - if (!vcpu) - vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); return kvm_has_mte(kern_hyp_va(vcpu->kvm)); } +static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_S1PIE)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_s1pie(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_TCR2)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_tcr2(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!system_supports_poe()) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_s1poe(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_ras(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_ras(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_sctlr2(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_SCTLR2)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_sctlr2(kern_hyp_va(vcpu->kvm)); +} + static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { - ctxt_sys_reg(ctxt, CSSELR_EL1) = read_sysreg(csselr_el1); ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); ctxt_sys_reg(ctxt, CPACR_EL1) = read_sysreg_el1(SYS_CPACR); ctxt_sys_reg(ctxt, TTBR0_EL1) = read_sysreg_el1(SYS_TTBR0); ctxt_sys_reg(ctxt, TTBR1_EL1) = read_sysreg_el1(SYS_TTBR1); ctxt_sys_reg(ctxt, TCR_EL1) = read_sysreg_el1(SYS_TCR); + if (ctxt_has_tcrx(ctxt)) { + ctxt_sys_reg(ctxt, TCR2_EL1) = read_sysreg_el1(SYS_TCR2); + + if (ctxt_has_s1pie(ctxt)) { + ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); + ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); + } + + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR); + } ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR); ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0); ctxt_sys_reg(ctxt, AFSR1_EL1) = read_sysreg_el1(SYS_AFSR1); @@ -65,6 +169,9 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1); ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR); ctxt_sys_reg(ctxt, SPSR_EL1) = read_sysreg_el1(SYS_SPSR); + + if (ctxt_has_sctlr2(ctxt)) + ctxt_sys_reg(ctxt, SCTLR2_EL1) = read_sysreg_el1(SYS_SCTLR2); } static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) @@ -77,13 +184,22 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) if (!has_vhe() && ctxt->__hyp_running_vcpu) ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR); - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return; + + if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt))) ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2); + else if (ctxt_has_ras(ctxt)) + ctxt_sys_reg(ctxt, VDISR_EL2) = read_sysreg_s(SYS_VDISR_EL2); } static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { - write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); + write_sysreg(*ctxt_mdscr_el1(ctxt), mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. + if (ctxt_has_s1poe(ctxt)) + write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0); } static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) @@ -92,10 +208,11 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0), tpidrro_el0); } -static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) +static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, + u64 midr, u64 mpidr) { - write_sysreg(ctxt_sys_reg(ctxt, MPIDR_EL1), vmpidr_el2); - write_sysreg(ctxt_sys_reg(ctxt, CSSELR_EL1), csselr_el1); + write_sysreg(midr, vpidr_el2); + write_sysreg(mpidr, vmpidr_el2); if (has_vhe() || !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { @@ -116,6 +233,17 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt_sys_reg(ctxt, CPACR_EL1), SYS_CPACR); write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL1), SYS_TTBR0); write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL1), SYS_TTBR1); + if (ctxt_has_tcrx(ctxt)) { + write_sysreg_el1(ctxt_sys_reg(ctxt, TCR2_EL1), SYS_TCR2); + + if (ctxt_has_s1pie(ctxt)) { + write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); + write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); + } + + if (ctxt_has_s1poe(ctxt)) + write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR); + } write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR); write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0); write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR1_EL1), SYS_AFSR1); @@ -154,12 +282,33 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt_sys_reg(ctxt, SP_EL1), sp_el1); write_sysreg_el1(ctxt_sys_reg(ctxt, ELR_EL1), SYS_ELR); write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1), SYS_SPSR); + + if (ctxt_has_sctlr2(ctxt)) + write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR2_EL1), SYS_SCTLR2); +} + +/* Read the VCPU state's PSTATE, but translate (v)EL2 to EL1. */ +static inline u64 to_hw_pstate(const struct kvm_cpu_context *ctxt) +{ + u64 mode = ctxt->regs.pstate & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL2t: + mode = PSR_MODE_EL1t; + break; + case PSR_MODE_EL2h: + mode = PSR_MODE_EL1h; + break; + } + + return (ctxt->regs.pstate & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; } static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) { - u64 pstate = ctxt->regs.pstate; + u64 pstate = to_hw_pstate(ctxt); u64 mode = pstate & PSR_AA32_MODE_MASK; + u64 vdisr; /* * Safety check to ensure we're setting the CPU up to enter the guest @@ -178,8 +327,17 @@ static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctx write_sysreg_el2(ctxt->regs.pc, SYS_ELR); write_sysreg_el2(pstate, SYS_SPSR); - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) - write_sysreg_s(ctxt_sys_reg(ctxt, DISR_EL1), SYS_VDISR_EL2); + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return; + + if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt))) + vdisr = ctxt_sys_reg(ctxt, DISR_EL1); + else if (ctxt_has_ras(ctxt)) + vdisr = ctxt_sys_reg(ctxt, VDISR_EL2); + else + vdisr = 0; + + write_sysreg_s(vdisr, SYS_VDISR_EL2); } static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) @@ -192,11 +350,11 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) vcpu->arch.ctxt.spsr_irq = read_sysreg(spsr_irq); vcpu->arch.ctxt.spsr_fiq = read_sysreg(spsr_fiq); - __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); - __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); + __vcpu_assign_sys_reg(vcpu, DACR32_EL2, read_sysreg(dacr32_el2)); + __vcpu_assign_sys_reg(vcpu, IFSR32_EL2, read_sysreg(ifsr32_el2)); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) - __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) + __vcpu_assign_sys_reg(vcpu, DBGVCR32_EL2, read_sysreg(dbgvcr32_el2)); } static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) @@ -212,7 +370,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h new file mode 100644 index 000000000000..146e0aebfa1c --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran <qwandor@google.com> + */ +#ifndef __KVM_HYP_FFA_H +#define __KVM_HYP_FFA_H + +#include <asm/kvm_host.h> + +#define FFA_MIN_FUNC_NUM 0x60 +#define FFA_MAX_FUNC_NUM 0xFF + +int hyp_ffa_init(void *pages); +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id); + +#endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h deleted file mode 100644 index 07edfc7524c9..000000000000 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ /dev/null @@ -1,205 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2021 Google LLC - * Author: Fuad Tabba <tabba@google.com> - */ - -#ifndef __ARM64_KVM_FIXED_CONFIG_H__ -#define __ARM64_KVM_FIXED_CONFIG_H__ - -#include <asm/sysreg.h> - -/* - * This file contains definitions for features to be allowed or restricted for - * guest virtual machines, depending on the mode KVM is running in and on the - * type of guest that is running. - * - * The ALLOW masks represent a bitmask of feature fields that are allowed - * without any restrictions as long as they are supported by the system. - * - * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for - * features that are restricted to support at most the specified feature. - * - * If a feature field is not present in either, than it is not supported. - * - * The approach taken for protected VMs is to allow features that are: - * - Needed by common Linux distributions (e.g., floating point) - * - Trivial to support, e.g., supporting the feature does not introduce or - * require tracking of additional state in KVM - * - Cannot be trapped or prevent the guest from using anyway - */ - -/* - * Allow for protected VMs: - * - Floating-point and Advanced SIMD - * - Data Independent Timing - */ -#define PVM_ID_AA64PFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - AArch64 guests only (no support for AArch32 guests): - * AArch32 adds complexity in trap handling, emulation, condition codes, - * etc... - * - RAS (v1) - * Supported by KVM - */ -#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL2), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL3), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), ID_AA64PFR0_EL1_RAS_IMP) \ - ) - -/* - * Allow for protected VMs: - * - Branch Target Identification - * - Speculative Store Bypassing - */ -#define PVM_ID_AA64PFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \ - ) - -/* - * Allow for protected VMs: - * - Mixed-endian - * - Distinction between Secure and Non-secure Memory - * - Mixed-endian at EL0 only - * - Non-context synchronizing exception entry and exit - */ -#define PVM_ID_AA64MMFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - 40-bit IPA - * - 16-bit ASID - */ -#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ - ) - -/* - * Allow for protected VMs: - * - Hardware translation table updates to Access flag and Dirty state - * - Number of VMID bits from CPU - * - Hierarchical Permission Disables - * - Privileged Access Never - * - SError interrupt exceptions from speculative reads - * - Enhanced Translation Synchronization - */ -#define PVM_ID_AA64MMFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) \ - ) - -/* - * Allow for protected VMs: - * - Common not Private translations - * - User Access Override - * - IESB bit in the SCTLR_ELx registers - * - Unaligned single-copy atomicity and atomic functions - * - ESR_ELx.EC value on an exception by read access to feature ID space - * - TTL field in address operations. - * - Break-before-make sequences when changing translation block size - * - E0PDx mechanism - */ -#define PVM_ID_AA64MMFR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \ - ) - -/* - * No support for Scalable Vectors for protected VMs: - * Requires additional support from KVM, e.g., context-switching and - * trapping at EL2 - */ -#define PVM_ID_AA64ZFR0_ALLOW (0ULL) - -/* - * No support for debug, including breakpoints, and watchpoints for protected - * VMs: - * The Arm architecture mandates support for at least the Armv8 debug - * architecture, which would include at least 2 hardware breakpoints and - * watchpoints. Providing that support to protected guests adds - * considerable state and complexity. Therefore, the reserved value of 0 is - * used for debug-related fields. - */ -#define PVM_ID_AA64DFR0_ALLOW (0ULL) -#define PVM_ID_AA64DFR1_ALLOW (0ULL) - -/* - * No support for implementation defined features. - */ -#define PVM_ID_AA64AFR0_ALLOW (0ULL) -#define PVM_ID_AA64AFR1_ALLOW (0ULL) - -/* - * No restrictions on instructions implemented in AArch64. - */ -#define PVM_ID_AA64ISAR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ - ) - -#define PVM_ID_AA64ISAR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \ - ) - -#define PVM_ID_AA64ISAR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) \ - ) - -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); -bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); -bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); -int kvm_check_pvm_sysreg_table(void); - -#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 0a048dc06a7d..3766333bace9 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -7,7 +7,7 @@ #include <nvhe/memory.h> #include <nvhe/spinlock.h> -#define HYP_NO_ORDER USHRT_MAX +#define HYP_NO_ORDER ((u8)(~0)) struct hyp_pool { /* @@ -16,14 +16,14 @@ struct hyp_pool { * API at EL2. */ hyp_spinlock_t lock; - struct list_head free_area[MAX_ORDER]; + struct list_head free_area[NR_PAGE_ORDERS]; phys_addr_t range_start; phys_addr_t range_end; - unsigned short max_order; + u8 max_order; }; /* Allocation */ -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order); void hyp_split_page(struct hyp_page *page); void hyp_get_page(struct hyp_pool *pool, void *addr); void hyp_put_page(struct hyp_pool *pool, void *addr); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index b7bdbe63deed..5f9d56754e39 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -11,40 +11,10 @@ #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> #include <asm/virt.h> +#include <nvhe/memory.h> #include <nvhe/pkvm.h> #include <nvhe/spinlock.h> -/* - * SW bits 0-1 are reserved to track the memory ownership state of each page: - * 00: The page is owned exclusively by the page-table owner. - * 01: The page is owned by the page-table owner, but is shared - * with another entity. - * 10: The page is shared with, but not owned by the page-table owner. - * 11: Reserved for future use (lending). - */ -enum pkvm_page_state { - PKVM_PAGE_OWNED = 0ULL, - PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, - PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, - __PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 | - KVM_PGTABLE_PROT_SW1, - - /* Meta-states which aren't encoded directly in the PTE's SW bits */ - PKVM_NOPAGE, -}; - -#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) -static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, - enum pkvm_page_state state) -{ - return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state; -} - -static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) -{ - return prot & PKVM_PAGE_STATE_PROT_MASK; -} - struct host_mmu { struct kvm_arch arch; struct kvm_pgtable pgt; @@ -57,6 +27,7 @@ extern struct host_mmu host_mmu; enum pkvm_component_id { PKVM_ID_HOST, PKVM_ID_HYP, + PKVM_ID_FFA, }; extern unsigned long hyp_nr_cpus; @@ -66,6 +37,15 @@ int __pkvm_host_share_hyp(u64 pfn); int __pkvm_host_unshare_hyp(u64 pfn); int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot); +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm); +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); @@ -76,7 +56,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); @@ -87,4 +67,10 @@ static __always_inline void __load_host_stage2(void) else write_sysreg(0, vttbr_el2); } + +#ifdef CONFIG_NVHE_EL2_DEBUG +void pkvm_ownership_selftest(void *base); +#else +static inline void pkvm_ownership_selftest(void *base) { } +#endif #endif /* __KVM_NVHE_MEM_PROTECT__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index ab205c4d6774..dee1a406b0c2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -7,9 +7,61 @@ #include <linux/types.h> +/* + * Bits 0-1 are used to encode the memory ownership state of each page from the + * point of view of a pKVM "component" (host, hyp, guest, ... see enum + * pkvm_component_id): + * 00: The page is owned and exclusively accessible by the component; + * 01: The page is owned and accessible by the component, but is also + * accessible by another component; + * 10: The page is accessible but not owned by the component; + * The storage of this state depends on the component: either in the + * hyp_vmemmap for the host and hyp states or in PTE software bits for guests. + */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = BIT(0), + PKVM_PAGE_SHARED_BORROWED = BIT(1), + + /* + * 'Meta-states' are not stored directly in PTE SW bits for guest + * states, but inferred from the context (e.g. invalid PTE entries). + * For the host and hyp, meta-states are stored directly in the + * struct hyp_page. + */ + PKVM_NOPAGE = BIT(0) | BIT(1), +}; +#define PKVM_PAGE_STATE_MASK (BIT(0) | BIT(1)) + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + prot &= ~PKVM_PAGE_STATE_PROT_MASK; + prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state); + return prot; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot); +} + struct hyp_page { - unsigned short refcount; - unsigned short order; + u16 refcount; + u8 order; + + /* Host state. Guarded by the host stage-2 lock. */ + unsigned __host_state : 4; + + /* + * Complement of the hyp state. Guarded by the hyp stage-1 lock. We use + * the complement so that the initial 0 in __hyp_state_comp (due to the + * entire vmemmap starting off zeroed) encodes PKVM_NOPAGE. + */ + unsigned __hyp_state_comp : 4; + + u32 host_share_guest_count; }; extern u64 __hyp_vmemmap; @@ -29,7 +81,13 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) #define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) -#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)]) + +static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys) +{ + BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u64)); + return &hyp_vmemmap[hyp_phys_to_pfn(phys)]; +} + #define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) #define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt)) @@ -38,6 +96,26 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) #define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +static inline enum pkvm_page_state get_host_state(struct hyp_page *p) +{ + return p->__host_state; +} + +static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__host_state = state; +} + +static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p) +{ + return p->__hyp_state_comp ^ PKVM_PAGE_STATE_MASK; +} + +static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_MASK; +} + /* * Refcounting for 'struct hyp_page'. * hyp_pool::lock must be held if atomic access to the refcount is required. diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index d5ec972b5c1e..6e83ce35c2f2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -13,9 +13,11 @@ extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; -int hyp_create_pcpu_fixmap(void); +int hyp_create_fixmap(void); void *hyp_fixmap_map(phys_addr_t phys); void hyp_fixmap_unmap(void); +void *hyp_fixblock_map(phys_addr_t phys, size_t *size); +void hyp_fixblock_unmap(void); int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); @@ -26,6 +28,7 @@ int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot, unsigned long *haddr); +int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr); int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr); #endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 82b3d62538a6..184ad7a39950 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -20,10 +20,16 @@ struct pkvm_hyp_vcpu { /* Backpointer to the host's (untrusted) vCPU instance. */ struct kvm_vcpu *host_vcpu; + + /* + * If this hyp vCPU is loaded, then this is a backpointer to the + * per-cpu pointer tracking us. Otherwise, NULL if not loaded. + */ + struct pkvm_hyp_vcpu **loaded_hyp_vcpu; }; /* - * Holds the relevant data for running a protected vm. + * Holds the relevant data for running a vm in protected mode. */ struct pkvm_hyp_vm { struct kvm kvm; @@ -37,24 +43,32 @@ struct pkvm_hyp_vm { struct hyp_pool pool; hyp_spinlock_t lock; - /* - * The number of vcpus initialized and ready to run. - * Modifying this is protected by 'vm_table_lock'. - */ - unsigned int nr_vcpus; - /* Array of the hyp vCPU structures for this VM. */ struct pkvm_hyp_vcpu *vcpus[]; }; +extern hyp_spinlock_t vm_table_lock; + static inline struct pkvm_hyp_vm * pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) { return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm); } +static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return vcpu_is_protected(&hyp_vcpu->vcpu); +} + +static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm) +{ + return kvm_vm_is_protected(&hyp_vm->kvm); +} + void pkvm_hyp_vm_table_init(void *tbl); +int __pkvm_reserve_vm(void); +void __pkvm_unreserve_vm(pkvm_handle_t handle); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva); int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, @@ -64,5 +78,15 @@ int __pkvm_teardown_vm(pkvm_handle_t handle); struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle); +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); + +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); +int kvm_check_pvm_sysreg_table(void); #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 45a84f0ade04..ba5382c12787 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -12,9 +12,8 @@ #include <asm/kvm_host.h> #define cpu_reg(ctxt, r) (ctxt)->regs.regs[r] -#define DECLARE_REG(type, name, ctxt, reg) \ +#define DECLARE_REG(type, name, ctxt, reg) \ + __always_unused int ___check_reg_ ## reg; \ type name = (type)cpu_reg(ctxt, (reg)) -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); - #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 530347cdebe3..a244ec25f8c5 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -12,7 +12,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ ccflags-y += -fno-stack-protector \ -DDISABLE_BRANCH_PROFILING \ - $(DISABLE_STACKLEAK_PLUGIN) + $(DISABLE_KSTACK_ERASE) hostprogs := gen-hyprel HOST_EXTRACFLAGS += -I$(objtree)/include @@ -20,12 +20,15 @@ HOST_EXTRACFLAGS += -I$(objtree)/include lib-objs := clear_page.o copy_page.o memcpy.o memset.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) +CFLAGS_switch.nvhe.o += -Wno-override-init + hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o -hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o +hyp-obj-y += ../../../kernel/smccc-call.o +hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o hyp-obj-y += $(lib-objs) ## @@ -89,24 +92,17 @@ quiet_cmd_hyprel = HYPREL $@ quiet_cmd_hypcopy = HYPCOPY $@ cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ -# Remove ftrace, Shadow Call Stack, and CFI CFLAGS. -# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. -KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) +# Remove ftrace and Shadow Call Stack CFLAGS. +# This is equivalent to the 'notrace' and '__noscs' annotations. +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) # Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile' # when profile optimization is applied. gen-hyprel does not support SHT_REL and # causes a build failure. Remove profile optimization flags. KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables -# KVM nVHE code is run at a different exception code with a different map, so -# compiler instrumentation that inserts callbacks or checks into the code may -# cause crashes. Just disable it. -GCOV_PROFILE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n -UBSAN_SANITIZE := n -KCOV_INSTRUMENT := n - -# Skip objtool checking for this directory because nVHE code is compiled with -# non-standard build rules. -OBJECT_FILES_NON_STANDARD := y +ifeq ($(CONFIG_UBSAN_KVM_EL2),y) +UBSAN_SANITIZE := y +# Always use brk and not hooks +ccflags-y += $(CFLAGS_UBSAN_TRAP) +endif diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index e17455773b98..2a1c0f49792b 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -27,17 +27,16 @@ static void __debug_save_spe(u64 *pmscr_el1) * Check if the host is actually using it ? */ reg = read_sysreg_s(SYS_PMBLIMITR_EL1); - if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT))) + if (!(reg & BIT(PMBLIMITR_EL1_E_SHIFT))) return; /* Yes; save the control register and disable data generation */ - *pmscr_el1 = read_sysreg_s(SYS_PMSCR_EL1); - write_sysreg_s(0, SYS_PMSCR_EL1); + *pmscr_el1 = read_sysreg_el1(SYS_PMSCR); + write_sysreg_el1(0, SYS_PMSCR); isb(); /* Now drain all buffered data to memory */ psb_csync(); - dsb(nsh); } static void __debug_restore_spe(u64 pmscr_el1) @@ -49,46 +48,88 @@ static void __debug_restore_spe(u64 pmscr_el1) isb(); /* Re-enable data generation */ - write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); + write_sysreg_el1(pmscr_el1, SYS_PMSCR); } -static void __debug_save_trace(u64 *trfcr_el1) +static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) { - *trfcr_el1 = 0; + *saved_trfcr = read_sysreg_el1(SYS_TRFCR); + write_sysreg_el1(new_trfcr, SYS_TRFCR); +} + +static bool __trace_needs_drain(void) +{ + if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE)) + return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E; + + return host_data_test_flag(TRBE_ENABLED); +} + +static bool __trace_needs_switch(void) +{ + return host_data_test_flag(TRBE_ENABLED) || + host_data_test_flag(EL1_TRACING_CONFIGURED); +} + +static void __trace_switch_to_guest(void) +{ + /* Unsupported with TRBE so disable */ + if (host_data_test_flag(TRBE_ENABLED)) + *host_data_ptr(trfcr_while_in_guest) = 0; + + __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1), + *host_data_ptr(trfcr_while_in_guest)); + + if (__trace_needs_drain()) { + isb(); + tsb_csync(); + } +} - /* Check if the TRBE is enabled */ - if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_ENABLE)) +static void __trace_switch_to_host(void) +{ + __trace_do_switch(host_data_ptr(trfcr_while_in_guest), + *host_data_ptr(host_debug_state.trfcr_el1)); +} + +static void __debug_save_brbe(u64 *brbcr_el1) +{ + *brbcr_el1 = 0; + + /* Check if the BRBE is enabled */ + if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE))) return; + /* - * Prohibit trace generation while we are in guest. - * Since access to TRFCR_EL1 is trapped, the guest can't + * Prohibit branch record generation while we are in guest. + * Since access to BRBCR_EL1 is trapped, the guest can't * modify the filtering set by the host. */ - *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1); - write_sysreg_s(0, SYS_TRFCR_EL1); - isb(); - /* Drain the trace buffer to memory */ - tsb_csync(); - dsb(nsh); + *brbcr_el1 = read_sysreg_el1(SYS_BRBCR); + write_sysreg_el1(0, SYS_BRBCR); } -static void __debug_restore_trace(u64 trfcr_el1) +static void __debug_restore_brbe(u64 brbcr_el1) { - if (!trfcr_el1) + if (!brbcr_el1) return; - /* Restore trace filter controls */ - write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1); + /* Restore BRBE controls */ + write_sysreg_el1(brbcr_el1, SYS_BRBCR); } void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) - __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); - /* Disable and flush Self-Hosted Trace generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) - __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1); + if (host_data_test_flag(HAS_SPE)) + __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1)); + + /* Disable BRBE branch records */ + if (host_data_test_flag(HAS_BRBE)) + __debug_save_brbe(host_data_ptr(host_debug_state.brbcr_el1)); + + if (__trace_needs_switch()) + __trace_switch_to_guest(); } void __debug_switch_to_guest(struct kvm_vcpu *vcpu) @@ -98,18 +139,15 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) - __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) - __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1); + if (host_data_test_flag(HAS_SPE)) + __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1)); + if (host_data_test_flag(HAS_BRBE)) + __debug_restore_brbe(*host_data_ptr(host_debug_state.brbcr_el1)); + if (__trace_needs_switch()) + __trace_switch_to_host(); } void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c new file mode 100644 index 000000000000..f731cc4c3f28 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -0,0 +1,993 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by + * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware + * Framework for Arm A-profile", which is specified by Arm in document + * number DEN0077. + * + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran <qwandor@google.com> + * + * This driver hooks into the SMC trapping logic for the host and intercepts + * all calls falling within the FF-A range. Each call is either: + * + * - Forwarded on unmodified to the SPMD at EL3 + * - Rejected as "unsupported" + * - Accompanied by a host stage-2 page-table check/update and reissued + * + * Consequently, any attempts by the host to make guest memory pages + * accessible to the secure world using FF-A will be detected either here + * (in the case that the memory is already owned by the guest) or during + * donation to the guest (in the case that the memory was previously shared + * with the secure world). + * + * To allow the rolling-back of page-table updates and FF-A calls in the + * event of failure, operations involving the RXTX buffers are locked for + * the duration and are therefore serialised. + */ + +#include <linux/arm-smccc.h> +#include <linux/arm_ffa.h> +#include <asm/kvm_pkvm.h> + +#include <nvhe/ffa.h> +#include <nvhe/mem_protect.h> +#include <nvhe/memory.h> +#include <nvhe/trap_handler.h> +#include <nvhe/spinlock.h> + +/* + * "ID value 0 must be returned at the Non-secure physical FF-A instance" + * We share this ID with the host. + */ +#define HOST_FFA_ID 0 + +/* + * A buffer to hold the maximum descriptor size we can see from the host, + * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP + * when resolving the handle on the reclaim path. + */ +struct kvm_ffa_descriptor_buffer { + void *buf; + size_t len; +}; + +static struct kvm_ffa_descriptor_buffer ffa_desc_buf; + +struct kvm_ffa_buffers { + hyp_spinlock_t lock; + void *tx; + void *rx; +}; + +/* + * Note that we don't currently lock these buffers explicitly, instead + * relying on the locking of the host FFA buffers as we only have one + * client. + */ +static struct kvm_ffa_buffers hyp_buffers; +static struct kvm_ffa_buffers host_buffers; +static u32 hyp_ffa_version; +static bool has_version_negotiated; +static hyp_spinlock_t version_lock; + +static void ffa_to_smccc_error(struct arm_smccc_1_2_regs *res, u64 ffa_errno) +{ + *res = (struct arm_smccc_1_2_regs) { + .a0 = FFA_ERROR, + .a2 = ffa_errno, + }; +} + +static void ffa_to_smccc_res_prop(struct arm_smccc_1_2_regs *res, int ret, u64 prop) +{ + if (ret == FFA_RET_SUCCESS) { + *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_SUCCESS, + .a2 = prop }; + } else { + ffa_to_smccc_error(res, ret); + } +} + +static void ffa_to_smccc_res(struct arm_smccc_1_2_regs *res, int ret) +{ + ffa_to_smccc_res_prop(res, ret, 0); +} + +static void ffa_set_retval(struct kvm_cpu_context *ctxt, + struct arm_smccc_1_2_regs *res) +{ + cpu_reg(ctxt, 0) = res->a0; + cpu_reg(ctxt, 1) = res->a1; + cpu_reg(ctxt, 2) = res->a2; + cpu_reg(ctxt, 3) = res->a3; + cpu_reg(ctxt, 4) = res->a4; + cpu_reg(ctxt, 5) = res->a5; + cpu_reg(ctxt, 6) = res->a6; + cpu_reg(ctxt, 7) = res->a7; + + /* + * DEN0028C 2.6: SMC32/HVC32 call from aarch64 must preserve x8-x30. + * + * In FF-A 1.2, we cannot rely on the function ID sent by the caller to + * detect 32-bit calls because the CPU cycle management interfaces (e.g. + * FFA_MSG_WAIT, FFA_RUN) are 32-bit only but can have 64-bit responses. + * + * FFA-1.3 introduces 64-bit variants of the CPU cycle management + * interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests + * complete with SMC32 direct responses which *should* allow us use the + * function ID sent by the caller to determine whether to return x8-x17. + * + * Note that we also cannot rely on function IDs in the response. + * + * Given the above, assume SMC64 and send back x0-x17 unconditionally + * as the passthrough code (__kvm_hyp_host_forward_smc) does the same. + */ + cpu_reg(ctxt, 8) = res->a8; + cpu_reg(ctxt, 9) = res->a9; + cpu_reg(ctxt, 10) = res->a10; + cpu_reg(ctxt, 11) = res->a11; + cpu_reg(ctxt, 12) = res->a12; + cpu_reg(ctxt, 13) = res->a13; + cpu_reg(ctxt, 14) = res->a14; + cpu_reg(ctxt, 15) = res->a15; + cpu_reg(ctxt, 16) = res->a16; + cpu_reg(ctxt, 17) = res->a17; +} + +static bool is_ffa_call(u64 func_id) +{ + return ARM_SMCCC_IS_FAST_CALL(func_id) && + ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && + ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM && + ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM; +} + +static int ffa_map_hyp_buffers(u64 ffa_page_count) +{ + struct arm_smccc_1_2_regs res; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_RXTX_MAP, + .a1 = hyp_virt_to_phys(hyp_buffers.tx), + .a2 = hyp_virt_to_phys(hyp_buffers.rx), + .a3 = ffa_page_count, + }, &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static int ffa_unmap_hyp_buffers(void) +{ + struct arm_smccc_1_2_regs res; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RXTX_UNMAP, + .a1 = HOST_FFA_ID, + }, &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo, + u32 handle_hi, u32 fraglen, u32 endpoint_id) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_TX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fraglen, + .a4 = endpoint_id, + }, res); +} + +static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo, + u32 handle_hi, u32 fragoff) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_RX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fragoff, + .a4 = HOST_FFA_ID, + }, res); +} + +static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len, + u32 fraglen) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = func_id, + .a1 = len, + .a2 = fraglen, + }, res); +} + +static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo, + u32 handle_hi, u32 flags) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_RECLAIM, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = flags, + }, res); +} + +static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_MEM_RETRIEVE_REQ, + .a1 = len, + .a2 = len, + }, res); +} + +static void ffa_rx_release(struct arm_smccc_1_2_regs *res) +{ + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RX_RELEASE, + }, res); +} + +static void do_ffa_rxtx_map(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(phys_addr_t, tx, ctxt, 1); + DECLARE_REG(phys_addr_t, rx, ctxt, 2); + DECLARE_REG(u32, npages, ctxt, 3); + int ret = 0; + void *rx_virt, *tx_virt; + + if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (host_buffers.tx) { + ret = FFA_RET_DENIED; + goto out_unlock; + } + + /* + * Map our hypervisor buffers into the SPMD before mapping and + * pinning the host buffers in our own address space. + */ + ret = ffa_map_hyp_buffers(npages); + if (ret) + goto out_unlock; + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unmap; + } + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_tx; + } + + tx_virt = hyp_phys_to_virt(tx); + ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_rx; + } + + rx_virt = hyp_phys_to_virt(rx); + ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unpin_tx; + } + + host_buffers.tx = tx_virt; + host_buffers.rx = rx_virt; + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); + return; + +err_unpin_tx: + hyp_unpin_shared_mem(tx_virt, tx_virt + 1); +err_unshare_rx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx)); +err_unshare_tx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx)); +err_unmap: + ffa_unmap_hyp_buffers(); + goto out_unlock; +} + +static void do_ffa_rxtx_unmap(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + int ret = 0; + + if (id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx))); + host_buffers.tx = NULL; + + hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx))); + host_buffers.rx = NULL; + + ffa_unmap_hyp_buffers(); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); +} + +static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nshared = __ffa_host_share_ranges(ranges, nranges); + int ret = 0; + + if (nshared != nranges) { + WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges); + int ret = 0; + + if (nunshared != nranges) { + WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static void do_ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, fraglen, ctxt, 3); + DECLARE_REG(u32, endpoint_id, ctxt, 4); + struct ffa_mem_region_addr_range *buf; + int ret = FFA_RET_INVALID_PARAMETERS; + u32 nr_ranges; + + if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) + goto out; + + if (fraglen % sizeof(*buf)) + goto out; + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) + goto out_unlock; + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + nr_ranges = fraglen / sizeof(*buf); + + ret = ffa_host_share_ranges(buf, nr_ranges); + if (ret) { + /* + * We're effectively aborting the transaction, so we need + * to restore the global state back to what it was prior to + * transmission of the first fragment. + */ + ffa_mem_reclaim(res, handle_lo, handle_hi, 0); + WARN_ON(res->a0 != FFA_SUCCESS); + goto out_unlock; + } + + ffa_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id); + if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX) + WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges)); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + + /* + * If for any reason this did not succeed, we're in trouble as we have + * now lost the content of the previous fragments and we can't rollback + * the host stage-2 changes. The pages previously marked as shared will + * remain stuck in that state forever, hence preventing the host from + * sharing/donating them again and may possibly lead to subsequent + * failures, but this will not compromise confidentiality. + */ + return; +} + +static void __do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, len, ctxt, 1); + DECLARE_REG(u32, fraglen, ctxt, 2); + DECLARE_REG(u64, addr_mbz, ctxt, 3); + DECLARE_REG(u32, npages_mbz, ctxt, 4); + struct ffa_mem_region_attributes *ep_mem_access; + struct ffa_composite_mem_region *reg; + struct ffa_mem_region *buf; + u32 offset, nr_ranges, checked_offset; + int ret = 0; + + if (addr_mbz || npages_mbz || fraglen > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (fraglen < sizeof(struct ffa_mem_region) + + sizeof(struct ffa_mem_region_attributes)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + + ep_mem_access = (void *)buf + + ffa_mem_desc_offset(buf, 0, hyp_ffa_version); + offset = ep_mem_access->composite_off; + if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (fraglen < checked_offset) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + reg = (void *)buf + offset; + nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents; + if (nr_ranges % sizeof(reg->constituents[0])) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + nr_ranges /= sizeof(reg->constituents[0]); + ret = ffa_host_share_ranges(reg->constituents, nr_ranges); + if (ret) + goto out_unlock; + + ffa_mem_xfer(res, func_id, len, fraglen); + if (fraglen != len) { + if (res->a0 != FFA_MEM_FRAG_RX) + goto err_unshare; + + if (res->a3 != fraglen) + goto err_unshare; + } else if (res->a0 != FFA_SUCCESS) { + goto err_unshare; + } + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + return; + +err_unshare: + WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges)); + goto out_unlock; +} + +#define do_ffa_mem_xfer(fid, res, ctxt) \ + do { \ + BUILD_BUG_ON((fid) != FFA_FN64_MEM_SHARE && \ + (fid) != FFA_FN64_MEM_LEND); \ + __do_ffa_mem_xfer((fid), (res), (ctxt)); \ + } while (0); + +static void do_ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, flags, ctxt, 3); + struct ffa_mem_region_attributes *ep_mem_access; + struct ffa_composite_mem_region *reg; + u32 offset, len, fraglen, fragoff; + struct ffa_mem_region *buf; + int ret = 0; + u64 handle; + + handle = PACK_HANDLE(handle_lo, handle_hi); + + hyp_spin_lock(&host_buffers.lock); + + buf = hyp_buffers.tx; + *buf = (struct ffa_mem_region) { + .sender_id = HOST_FFA_ID, + .handle = handle, + }; + + ffa_retrieve_req(res, sizeof(*buf)); + buf = hyp_buffers.rx; + if (res->a0 != FFA_MEM_RETRIEVE_RESP) + goto out_unlock; + + len = res->a1; + fraglen = res->a2; + + ep_mem_access = (void *)buf + + ffa_mem_desc_offset(buf, 0, hyp_ffa_version); + offset = ep_mem_access->composite_off; + /* + * We can trust the SPMD to get this right, but let's at least + * check that we end up with something that doesn't look _completely_ + * bogus. + */ + if (WARN_ON(offset > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { + ret = FFA_RET_ABORTED; + ffa_rx_release(res); + goto out_unlock; + } + + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + ffa_rx_release(res); + goto out_unlock; + } + + buf = ffa_desc_buf.buf; + memcpy(buf, hyp_buffers.rx, fraglen); + ffa_rx_release(res); + + for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { + ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff); + if (res->a0 != FFA_MEM_FRAG_TX) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + fraglen = res->a3; + memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + ffa_rx_release(res); + } + + ffa_mem_reclaim(res, handle_lo, handle_hi, flags); + if (res->a0 != FFA_SUCCESS) + goto out_unlock; + + reg = (void *)buf + offset; + /* If the SPMD was happy, then we should be too. */ + WARN_ON(ffa_host_unshare_ranges(reg->constituents, + reg->addr_range_cnt)); +out_unlock: + hyp_spin_unlock(&host_buffers.lock); + + if (ret) + ffa_to_smccc_res(res, ret); +} + +/* + * Is a given FFA function supported, either by forwarding on directly + * or by handling at EL2? + */ +static bool ffa_call_supported(u64 func_id) +{ + switch (func_id) { + /* Unsupported memory management calls */ + case FFA_FN64_MEM_RETRIEVE_REQ: + case FFA_MEM_RETRIEVE_RESP: + case FFA_MEM_RELINQUISH: + case FFA_MEM_OP_PAUSE: + case FFA_MEM_OP_RESUME: + case FFA_MEM_FRAG_RX: + case FFA_FN64_MEM_DONATE: + /* Indirect message passing via RX/TX buffers */ + case FFA_MSG_SEND: + case FFA_MSG_POLL: + case FFA_MSG_WAIT: + /* 32-bit variants of 64-bit calls */ + case FFA_MSG_SEND_DIRECT_RESP: + case FFA_RXTX_MAP: + case FFA_MEM_DONATE: + case FFA_MEM_RETRIEVE_REQ: + /* Optional notification interfaces added in FF-A 1.1 */ + case FFA_NOTIFICATION_BITMAP_CREATE: + case FFA_NOTIFICATION_BITMAP_DESTROY: + case FFA_NOTIFICATION_BIND: + case FFA_NOTIFICATION_UNBIND: + case FFA_NOTIFICATION_SET: + case FFA_NOTIFICATION_GET: + case FFA_NOTIFICATION_INFO_GET: + /* Optional interfaces added in FF-A 1.2 */ + case FFA_MSG_SEND_DIRECT_REQ2: /* Optional per 7.5.1 */ + case FFA_MSG_SEND_DIRECT_RESP2: /* Optional per 7.5.1 */ + case FFA_CONSOLE_LOG: /* Optional per 13.1: not in Table 13.1 */ + case FFA_PARTITION_INFO_GET_REGS: /* Optional for virtual instances per 13.1 */ + return false; + } + + return true; +} + +static bool do_ffa_features(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + u64 prop = 0; + int ret = 0; + + if (!ffa_call_supported(id)) { + ret = FFA_RET_NOT_SUPPORTED; + goto out_handled; + } + + switch (id) { + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + ret = FFA_RET_SUCCESS; + prop = 0; /* No support for dynamic buffers */ + goto out_handled; + default: + return false; + } + +out_handled: + ffa_to_smccc_res_prop(res, ret, prop); + return true; +} + +static int hyp_ffa_post_init(void) +{ + size_t min_rxtx_sz; + struct arm_smccc_1_2_regs res; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_ID_GET, + }, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + if (res.a2 != HOST_FFA_ID) + return -EINVAL; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_FEATURES, + .a1 = FFA_FN64_RXTX_MAP, + }, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) { + case FFA_FEAT_RXTX_MIN_SZ_4K: + min_rxtx_sz = SZ_4K; + break; + case FFA_FEAT_RXTX_MIN_SZ_16K: + min_rxtx_sz = SZ_16K; + break; + case FFA_FEAT_RXTX_MIN_SZ_64K: + min_rxtx_sz = SZ_64K; + break; + default: + return -EINVAL; + } + + if (min_rxtx_sz > PAGE_SIZE) + return -EOPNOTSUPP; + + return 0; +} + +static void do_ffa_version(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, ffa_req_version, ctxt, 1); + + if (FFA_MAJOR_VERSION(ffa_req_version) != 1) { + res->a0 = FFA_RET_NOT_SUPPORTED; + return; + } + + hyp_spin_lock(&version_lock); + if (has_version_negotiated) { + if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) + res->a0 = FFA_RET_NOT_SUPPORTED; + else + res->a0 = hyp_ffa_version; + goto unlock; + } + + /* + * If the client driver tries to downgrade the version, we need to ask + * first if TEE supports it. + */ + if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) { + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = ffa_req_version, + }, res); + if (res->a0 == FFA_RET_NOT_SUPPORTED) + goto unlock; + + hyp_ffa_version = ffa_req_version; + } + + if (hyp_ffa_post_init()) { + res->a0 = FFA_RET_NOT_SUPPORTED; + } else { + smp_store_release(&has_version_negotiated, true); + res->a0 = hyp_ffa_version; + } +unlock: + hyp_spin_unlock(&version_lock); +} + +static void do_ffa_part_get(struct arm_smccc_1_2_regs *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, uuid0, ctxt, 1); + DECLARE_REG(u32, uuid1, ctxt, 2); + DECLARE_REG(u32, uuid2, ctxt, 3); + DECLARE_REG(u32, uuid3, ctxt, 4); + DECLARE_REG(u32, flags, ctxt, 5); + u32 count, partition_sz, copy_sz; + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.rx) { + ffa_to_smccc_res(res, FFA_RET_BUSY); + goto out_unlock; + } + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_PARTITION_INFO_GET, + .a1 = uuid0, + .a2 = uuid1, + .a3 = uuid2, + .a4 = uuid3, + .a5 = flags, + }, res); + + if (res->a0 != FFA_SUCCESS) + goto out_unlock; + + count = res->a2; + if (!count) + goto out_unlock; + + if (hyp_ffa_version > FFA_VERSION_1_0) { + /* Get the number of partitions deployed in the system */ + if (flags & 0x1) + goto out_unlock; + + partition_sz = res->a3; + } else { + /* FFA_VERSION_1_0 lacks the size in the response */ + partition_sz = FFA_1_0_PARTITON_INFO_SZ; + } + + copy_sz = partition_sz * count; + if (copy_sz > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { + ffa_to_smccc_res(res, FFA_RET_ABORTED); + goto out_unlock; + } + + memcpy(host_buffers.rx, hyp_buffers.rx, copy_sz); +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +} + +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) +{ + struct arm_smccc_1_2_regs res; + + /* + * There's no way we can tell what a non-standard SMC call might + * be up to. Ideally, we would terminate these here and return + * an error to the host, but sadly devices make use of custom + * firmware calls for things like power management, debugging, + * RNG access and crash reporting. + * + * Given that the architecture requires us to trust EL3 anyway, + * we forward unrecognised calls on under the assumption that + * the firmware doesn't expose a mechanism to access arbitrary + * non-secure memory. Short of a per-device table of SMCs, this + * is the best we can do. + */ + if (!is_ffa_call(func_id)) + return false; + + if (func_id != FFA_VERSION && + !smp_load_acquire(&has_version_negotiated)) { + ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS); + goto out_handled; + } + + switch (func_id) { + case FFA_FEATURES: + if (!do_ffa_features(&res, host_ctxt)) + return false; + goto out_handled; + /* Memory management */ + case FFA_FN64_RXTX_MAP: + do_ffa_rxtx_map(&res, host_ctxt); + goto out_handled; + case FFA_RXTX_UNMAP: + do_ffa_rxtx_unmap(&res, host_ctxt); + goto out_handled; + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt); + goto out_handled; + case FFA_MEM_RECLAIM: + do_ffa_mem_reclaim(&res, host_ctxt); + goto out_handled; + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt); + goto out_handled; + case FFA_MEM_FRAG_TX: + do_ffa_mem_frag_tx(&res, host_ctxt); + goto out_handled; + case FFA_VERSION: + do_ffa_version(&res, host_ctxt); + goto out_handled; + case FFA_PARTITION_INFO_GET: + do_ffa_part_get(&res, host_ctxt); + goto out_handled; + } + + if (ffa_call_supported(func_id)) + return false; /* Pass through */ + + ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED); +out_handled: + ffa_set_retval(host_ctxt, &res); + return true; +} + +int hyp_ffa_init(void *pages) +{ + struct arm_smccc_1_2_regs res; + void *tx, *rx; + + if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) + return 0; + + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = FFA_VERSION_1_2, + }, &res); + if (res.a0 == FFA_RET_NOT_SUPPORTED) + return 0; + + /* + * Firmware returns the maximum supported version of the FF-A + * implementation. Check that the returned version is + * backwards-compatible with the hyp according to the rules in DEN0077A + * v1.1 REL0 13.2.1. + * + * Of course, things are never simple when dealing with firmware. v1.1 + * broke ABI with v1.0 on several structures, which is itself + * incompatible with the aforementioned versioning scheme. The + * expectation is that v1.x implementations that do not support the v1.0 + * ABI return NOT_SUPPORTED rather than a version number, according to + * DEN0077A v1.1 REL0 18.6.4. + */ + if (FFA_MAJOR_VERSION(res.a0) != 1) + return -EOPNOTSUPP; + + if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_2)) + hyp_ffa_version = res.a0; + else + hyp_ffa_version = FFA_VERSION_1_2; + + tx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + rx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + + ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) { + .buf = pages, + .len = PAGE_SIZE * + (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)), + }; + + hyp_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + .tx = tx, + .rx = rx, + }; + + host_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + }; + + version_lock = __HYP_SPIN_LOCK_UNLOCKED; + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c index 6bc88a756cb7..b63f4e1c1033 100644 --- a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c +++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c @@ -50,6 +50,9 @@ #ifndef R_AARCH64_ABS64 #define R_AARCH64_ABS64 257 #endif +#ifndef R_AARCH64_ABS32 +#define R_AARCH64_ABS32 258 +#endif #ifndef R_AARCH64_PREL64 #define R_AARCH64_PREL64 260 #endif @@ -383,6 +386,9 @@ static void emit_rela_section(Elf64_Shdr *sh_rela) case R_AARCH64_ABS64: emit_rela_abs64(rela, sh_orig_name); break; + /* Allow 32-bit absolute relocation, for kCFI type hashes. */ + case R_AARCH64_ABS32: + break; /* Allow position-relative data relocations. */ case R_AARCH64_PREL64: case R_AARCH64_PREL32: diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index b6c0188c4b35..eef15b374abb 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -10,6 +10,7 @@ #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_ptrauth.h> .text @@ -37,10 +38,43 @@ SYM_FUNC_START(__host_exit) /* Save the host context pointer in x29 across the function call */ mov x29, x0 + +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL +alternative_if_not ARM64_HAS_ADDRESS_AUTH +b __skip_pauth_save +alternative_else_nop_endif + +alternative_if ARM64_KVM_PROTECTED_MODE + /* Save kernel ptrauth keys. */ + add x18, x29, #CPU_APIAKEYLO_EL1 + ptrauth_save_state x18, x19, x20 + + /* Use hyp keys. */ + adr_this_cpu x18, kvm_hyp_ctxt, x19 + add x18, x18, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state x18, x19, x20 + isb +alternative_else_nop_endif +__skip_pauth_save: +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + bl handle_trap - /* Restore host regs x0-x17 */ __host_enter_restore_full: + /* Restore kernel keys. */ +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL +alternative_if_not ARM64_HAS_ADDRESS_AUTH +b __skip_pauth_restore +alternative_else_nop_endif + +alternative_if ARM64_KVM_PROTECTED_MODE + add x18, x29, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state x18, x19, x20 +alternative_else_nop_endif +__skip_pauth_restore: +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + + /* Restore host regs x0-x17 */ ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] @@ -76,7 +110,7 @@ SYM_FUNC_END(__host_enter) * u64 elr, u64 par); */ SYM_FUNC_START(__hyp_do_panic) - /* Prepare and exit to the host's panic funciton. */ + /* Prepare and exit to the host's panic function. */ mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) msr spsr_el2, lr @@ -90,7 +124,7 @@ SYM_FUNC_START(__hyp_do_panic) /* Ensure host stage-2 is disabled */ mrs x0, hcr_el2 bic x0, x0, #HCR_VM - msr hcr_el2, x0 + msr_hcr_el2 x0 isb tlbi vmalls12e1 dsb nsh @@ -154,21 +188,15 @@ SYM_FUNC_END(__host_hvc) /* * Test whether the SP has overflowed, without corrupting a GPR. - * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit + * nVHE hypervisor stacks are aligned so that the NVHE_STACK_SHIFT bit * of SP should always be 1. */ add sp, sp, x0 // sp' = sp + x0 sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp - tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@ + tbz x0, #NVHE_STACK_SHIFT, .L__hyp_sp_overflow\@ sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp - /* If a guest is loaded, panic out of it. */ - stp x0, x1, [sp, #-16]! - get_loaded_vcpu x0, x1 - cbnz x0, __guest_exit_panic - add sp, sp, #16 - /* * The panic may not be clean if the exception is taken before the host * context has been saved by __host_exit or after the hyp context has @@ -263,3 +291,13 @@ SYM_CODE_START(__kvm_hyp_host_forward_smc) ret SYM_CODE_END(__kvm_hyp_host_forward_smc) + +/* + * kvm_host_psci_cpu_entry is called through br instruction, which requires + * bti j instruction as compilers (gcc and llvm) doesn't insert bti j for external + * functions, but bti c instead. + */ +SYM_CODE_START(kvm_host_psci_cpu_entry) + bti j + b __kvm_host_psci_cpu_entry +SYM_CODE_END(kvm_host_psci_cpu_entry) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index c953fb4b9a13..aada42522e7b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -5,6 +5,7 @@ */ #include <linux/arm-smccc.h> +#include <linux/cfi_types.h> #include <linux/linkage.h> #include <asm/alternative.h> @@ -23,28 +24,25 @@ .align 11 SYM_CODE_START(__kvm_hyp_init) - ventry __invalid // Synchronous EL2t - ventry __invalid // IRQ EL2t - ventry __invalid // FIQ EL2t - ventry __invalid // Error EL2t + ventry . // Synchronous EL2t + ventry . // IRQ EL2t + ventry . // FIQ EL2t + ventry . // Error EL2t - ventry __invalid // Synchronous EL2h - ventry __invalid // IRQ EL2h - ventry __invalid // FIQ EL2h - ventry __invalid // Error EL2h + ventry . // Synchronous EL2h + ventry . // IRQ EL2h + ventry . // FIQ EL2h + ventry . // Error EL2h ventry __do_hyp_init // Synchronous 64-bit EL1 - ventry __invalid // IRQ 64-bit EL1 - ventry __invalid // FIQ 64-bit EL1 - ventry __invalid // Error 64-bit EL1 + ventry . // IRQ 64-bit EL1 + ventry . // FIQ 64-bit EL1 + ventry . // Error 64-bit EL1 - ventry __invalid // Synchronous 32-bit EL1 - ventry __invalid // IRQ 32-bit EL1 - ventry __invalid // FIQ 32-bit EL1 - ventry __invalid // Error 32-bit EL1 - -__invalid: - b . + ventry . // Synchronous 32-bit EL1 + ventry . // IRQ 32-bit EL1 + ventry . // FIQ 32-bit EL1 + ventry . // Error 32-bit EL1 /* * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers. @@ -57,6 +55,7 @@ __do_hyp_init: cmp x0, #HVC_STUB_HCALL_NR b.lo __kvm_handle_stub_hvc + bic x0, x0, #ARM_SMCCC_CALL_HINTS mov x3, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) cmp x0, x3 b.eq 1f @@ -75,6 +74,17 @@ __do_hyp_init: SYM_CODE_END(__kvm_hyp_init) /* + * Initialize EL2 CPU state to sane values. + * + * HCR_EL2.E2H must have been initialized already. + */ +SYM_CODE_START_LOCAL(__kvm_init_el2_state) + init_el2_state // Clobbers x0..x2 + finalise_el2_state + ret +SYM_CODE_END(__kvm_init_el2_state) + +/* * Initialize the hypervisor in EL2. * * Only uses x0..x2 so as to not clobber callee-saved SMCCC registers @@ -83,9 +93,6 @@ SYM_CODE_END(__kvm_hyp_init) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START_LOCAL(___kvm_hyp_init) - ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] - msr tpidr_el2, x1 - ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA] mov sp, x1 @@ -93,7 +100,26 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) msr mair_el2, x1 ldr x1, [x0, #NVHE_INIT_HCR_EL2] - msr hcr_el2, x1 + msr_hcr_el2 x1 + + mov x2, #HCR_E2H + and x2, x1, x2 + cbz x2, 1f + + // hVHE: Replay the EL2 setup to account for the E2H bit + // TPIDR_EL2 is used to preserve x0 across the macro maze... + isb + msr tpidr_el2, x0 + str lr, [x0, #NVHE_INIT_TMP] + + bl __kvm_init_el2_state + + mrs x0, tpidr_el2 + ldr lr, [x0, #NVHE_INIT_TMP] + +1: + ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] + msr tpidr_el2, x1 ldr x1, [x0, #NVHE_INIT_VTTBR] msr vttbr_el2, x1 @@ -108,18 +134,14 @@ alternative_if ARM64_HAS_CNP alternative_else_nop_endif msr ttbr0_el2, x2 - /* - * Set the PS bits in TCR_EL2. - */ ldr x0, [x0, #NVHE_INIT_TCR_EL2] - tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 msr tcr_el2, x0 isb /* Invalidate the stale TLBs from Bootloader */ tlbi alle2 - tlbi vmalls12e1 + tlbi alle1 dsb sy mov_q x0, INIT_SCTLR_EL2_MMU_ON @@ -128,6 +150,13 @@ alternative_if ARM64_HAS_ADDRESS_AUTH SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) orr x0, x0, x1 alternative_else_nop_endif + +#ifdef CONFIG_ARM64_BTI_KERNEL +alternative_if ARM64_BTI + orr x0, x0, #SCTLR_EL2_BT +alternative_else_nop_endif +#endif /* CONFIG_ARM64_BTI_KERNEL */ + msr sctlr_el2, x0 isb @@ -181,8 +210,9 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) 2: msr SPsel, #1 // We want to use SP_EL{1,2} - /* Initialize EL2 CPU state to sane values. */ - init_el2_state // Clobbers x0..x2 + init_el2_hcr 0 + + bl __kvm_init_el2_state /* Enable MMU, set vectors and stack. */ mov x0, x28 @@ -195,6 +225,11 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) SYM_CODE_END(__kvm_hyp_init_cpu) SYM_CODE_START(__kvm_handle_stub_hvc) + /* + * __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so + * we need bti j at beginning. + */ + bti j cmp x0, #HVC_SOFT_RESTART b.ne 1f @@ -227,7 +262,7 @@ reset: alternative_if ARM64_KVM_PROTECTED_MODE mov_q x5, HCR_HOST_NVHE_FLAGS - msr hcr_el2, x5 + msr_hcr_el2 x5 alternative_else_nop_endif /* Install stub vectors */ @@ -241,31 +276,38 @@ alternative_else_nop_endif SYM_CODE_END(__kvm_handle_stub_hvc) -SYM_FUNC_START(__pkvm_init_switch_pgd) +/* + * void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp, + * void (*fn)(void)); + * + * SYM_TYPED_FUNC_START() allows C to call this ID-mapped function indirectly + * using a physical pointer without triggering a kCFI failure. + */ +SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd) /* Turn the MMU off */ pre_disable_mmu_workaround - mrs x2, sctlr_el2 - bic x3, x2, #SCTLR_ELx_M - msr sctlr_el2, x3 + mrs x3, sctlr_el2 + bic x4, x3, #SCTLR_ELx_M + msr sctlr_el2, x4 isb tlbi alle2 /* Install the new pgtables */ - ldr x3, [x0, #NVHE_INIT_PGD_PA] - phys_to_ttbr x4, x3 + phys_to_ttbr x5, x0 alternative_if ARM64_HAS_CNP - orr x4, x4, #TTBR_CNP_BIT + orr x5, x5, #TTBR_CNP_BIT alternative_else_nop_endif - msr ttbr0_el2, x4 + msr ttbr0_el2, x5 /* Set the new stack pointer */ - ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA] - mov sp, x0 + mov sp, x1 /* And turn the MMU back on! */ - set_sctlr_el2 x2 - ret x1 + dsb nsh + isb + set_sctlr_el2 x3 + ret x2 SYM_FUNC_END(__pkvm_init_switch_pgd) .popsection diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 728e01d4536b..a7c689152f68 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -5,6 +5,7 @@ */ #include <hyp/adjust_pc.h> +#include <hyp/switch.h> #include <asm/pgtable-types.h> #include <asm/kvm_asm.h> @@ -13,6 +14,7 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include <nvhe/ffa.h> #include <nvhe/mem_protect.h> #include <nvhe/mm.h> #include <nvhe/pkvm.h> @@ -22,26 +24,114 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); -static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) +{ + __vcpu_assign_sys_reg(vcpu, ZCR_EL1, read_sysreg_el1(SYS_ZCR)); + /* + * On saving/restoring guest sve state, always use the maximum VL for + * the guest. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) guest VL. + */ + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); +} + +static void __hyp_sve_restore_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + /* + * On saving/restoring host sve state, always use the maximum VL for + * the host. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) host VL. + * + * Note that this constrains the PE to the maximum shared VL + * that was discovered, if we wish to use larger VLs this will + * need to be revisited. + */ + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); + __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); + write_sysreg_el1(sve_state->zcr_el1, SYS_ZCR); +} + +static void fpsimd_sve_flush(void) +{ + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + +static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) +{ + bool has_fpmr; + + if (!guest_owns_fp_regs()) + return; + + /* + * Traps have been disabled by __deactivate_cptr_traps(), but there + * hasn't necessarily been a context synchronization event yet. + */ + isb(); + + if (vcpu_has_sve(vcpu)) + __hyp_sve_save_guest(vcpu); + else + __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + + has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); + if (has_fpmr) + __vcpu_assign_sys_reg(vcpu, FPMR, read_sysreg_s(SYS_FPMR)); + + if (system_supports_sve()) + __hyp_sve_restore_host(); + else + __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); + + if (has_fpmr) + write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); + + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; - hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; + hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner; - hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state; +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state; +} + +static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; - hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; + fpsimd_sve_flush(); + flush_debug_state(hyp_vcpu); + + hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; - hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; - hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); + hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & + (HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; - hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state; - - hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); - hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state; hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; @@ -55,35 +145,75 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; unsigned int i; + fpsimd_sve_sync(&hyp_vcpu->vcpu); + sync_debug_state(hyp_vcpu); + host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; - host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2; host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault; host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; - host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state; host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; + host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr; for (i = 0; i < hyp_cpu_if->used_lrs; ++i) host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; } +static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); + DECLARE_REG(u64, hcr_el2, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); + if (!hyp_vcpu) + return; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + /* Propagate WFx trapping flags */ + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI); + hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI); + } +} + +static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (hyp_vcpu) + pkvm_put_hyp_vcpu(hyp_vcpu); +} + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); int ret; - host_vcpu = kern_hyp_va(host_vcpu); - if (unlikely(is_protected_kvm_enabled())) { - struct pkvm_hyp_vcpu *hyp_vcpu; - struct kvm *host_kvm; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + + /* + * KVM (and pKVM) doesn't support SME guests for now, and + * ensures that SME features aren't enabled in pstate when + * loading a vcpu. Therefore, if SME features enabled the host + * is misbehaving. + */ + if (unlikely(system_supports_sme() && read_sysreg_s(SYS_SVCR))) { + ret = -EINVAL; + goto out; + } - host_kvm = kern_hyp_va(host_vcpu->kvm); - hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, - host_vcpu->vcpu_idx); if (!hyp_vcpu) { ret = -EINVAL; goto out; @@ -94,12 +224,149 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); sync_hyp_vcpu(hyp_vcpu); - pkvm_put_hyp_vcpu(hyp_vcpu); } else { + struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu); + /* The host is fully trusted, run its vCPU directly. */ - ret = __kvm_vcpu_run(host_vcpu); + fpsimd_lazy_switch_to_guest(vcpu); + ret = __kvm_vcpu_run(vcpu); + fpsimd_lazy_switch_to_host(vcpu); } +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, + host_vcpu->arch.pkvm_memcache.nr_pages, + &host_vcpu->arch.pkvm_memcache); +} + +static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) + goto out; + + ret = __pkvm_host_share_guest(pfn, gfn, nr_pages, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_unshare_guest(gfn, nr_pages, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 2); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_relax_perms_guest(gfn, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_wrprotect_guest(gfn, nr_pages, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(bool, mkold, host_ctxt, 4); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_test_clear_young_guest(gfn, nr_pages, mkold, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_mkyoung_guest(gfn, hyp_vcpu); out: cpu_reg(host_ctxt, 1) = ret; } @@ -125,6 +392,25 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); } +static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2); + DECLARE_REG(int, level, host_ctxt, 3); + + __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level); +} + +static void +handle___kvm_tlb_flush_vmid_range(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + DECLARE_REG(phys_addr_t, start, host_ctxt, 2); + DECLARE_REG(unsigned long, pages, host_ctxt, 3); + + __kvm_tlb_flush_vmid_range(kern_hyp_va(mmu), start, pages); +} + static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); @@ -132,6 +418,22 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); } +static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + struct pkvm_hyp_vm *hyp_vm; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + return; + + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + put_pkvm_hyp_vm(hyp_vm); +} + static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); @@ -158,26 +460,11 @@ static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); } -static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr(); -} - -static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt) -{ - __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1)); -} - static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) { __vgic_v3_init_lrs(); } -static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2(); -} - static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); @@ -185,11 +472,11 @@ static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); } -static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_restore_vmcr_aprs(kern_hyp_va(cpu_if)); } static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) @@ -260,11 +547,16 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } -static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) +static void handle___pkvm_reserve_vm(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + cpu_reg(host_ctxt, 1) = __pkvm_reserve_vm(); +} - __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); +static void handle___pkvm_unreserve_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + __pkvm_unreserve_vm(handle); } static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) @@ -300,7 +592,6 @@ typedef void (*hcall_t)(struct kvm_cpu_context *); static const hcall_t host_hcall[] = { /* ___kvm_hyp_init */ - HANDLE_FUNC(__kvm_get_mdcr_el2), HANDLE_FUNC(__pkvm_init), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_cpu_set_vector), @@ -311,21 +602,31 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_share_guest), + HANDLE_FUNC(__pkvm_host_unshare_guest), + HANDLE_FUNC(__pkvm_host_relax_perms_guest), + HANDLE_FUNC(__pkvm_host_wrprotect_guest), + HANDLE_FUNC(__pkvm_host_test_clear_young_guest), + HANDLE_FUNC(__pkvm_host_mkyoung_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh), HANDLE_FUNC(__kvm_tlb_flush_vmid), + HANDLE_FUNC(__kvm_tlb_flush_vmid_range), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__vgic_v3_read_vmcr), - HANDLE_FUNC(__vgic_v3_write_vmcr), HANDLE_FUNC(__vgic_v3_save_aprs), - HANDLE_FUNC(__vgic_v3_restore_aprs), - HANDLE_FUNC(__pkvm_vcpu_init_traps), + HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), + HANDLE_FUNC(__pkvm_reserve_vm), + HANDLE_FUNC(__pkvm_unreserve_vm), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), + HANDLE_FUNC(__pkvm_vcpu_load), + HANDLE_FUNC(__pkvm_vcpu_put), + HANDLE_FUNC(__pkvm_tlb_flush_vmid), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) @@ -346,6 +647,7 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) if (static_branch_unlikely(&kvm_protected_mode_initialized)) hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize; + id &= ~ARM_SMCCC_CALL_HINTS; id -= KVM_HOST_SMCCC_ID(0); if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall))) @@ -370,9 +672,14 @@ static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt) static void handle_host_smc(struct kvm_cpu_context *host_ctxt) { + DECLARE_REG(u64, func_id, host_ctxt, 0); bool handled; - handled = kvm_host_psci_handler(host_ctxt); + func_id &= ~ARM_SMCCC_CALL_HINTS; + + handled = kvm_host_psci_handler(host_ctxt, func_id); + if (!handled) + handled = kvm_host_ffa_handler(host_ctxt, func_id); if (!handled) default_host_smc_handler(host_ctxt); @@ -391,11 +698,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_SMC64: handle_host_smc(host_ctxt); break; - case ESR_ELx_EC_SVE: - sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); - isb(); - sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); - break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: handle_host_mem_abort(host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index f4562f417d3f..d724f6d69302 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -25,5 +25,7 @@ SECTIONS { BEGIN_HYP_SECTION(.data..percpu) PERCPU_INPUT(L1_CACHE_BYTES) END_HYP_SECTION + HYP_SECTION(.bss) + HYP_SECTION(.data) } diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c index d68abd7ea124..baa6260f88dc 100644 --- a/arch/arm64/kvm/hyp/nvhe/list_debug.c +++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c @@ -17,7 +17,7 @@ static inline __must_check bool nvhe_check_data_corruption(bool v) bool corruption = unlikely(condition); \ if (corruption) { \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ - BUG_ON(1); \ + BUG(); \ } else \ WARN_ON(1); \ } \ @@ -26,8 +26,9 @@ static inline __must_check bool nvhe_check_data_corruption(bool v) /* The predicates checked here are taken from lib/list_debug.c. */ -bool __list_add_valid(struct list_head *new, struct list_head *prev, - struct list_head *next) +__list_valid_slowpath +bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev, + struct list_head *next) { if (NVHE_CHECK_DATA_CORRUPTION(next->prev != prev) || NVHE_CHECK_DATA_CORRUPTION(prev->next != next) || @@ -37,7 +38,8 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, return true; } -bool __list_del_entry_valid(struct list_head *entry) +__list_valid_slowpath +bool __list_del_entry_valid_or_report(struct list_head *entry) { struct list_head *prev, *next; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 552653fa18be..49db32f3ddf7 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -60,6 +60,11 @@ static void hyp_unlock_component(void) hyp_spin_unlock(&pkvm_pgd_lock); } +#define for_each_hyp_page(__p, __st, __sz) \ + for (struct hyp_page *__p = hyp_phys_to_page(__st), \ + *__e = __p + ((__sz) >> PAGE_SHIFT); \ + __p < __e; __p++) + static void *host_s2_zalloc_pages_exact(size_t size) { void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size)); @@ -91,9 +96,9 @@ static void host_s2_put_page(void *addr) hyp_put_page(&host_s2_pool, addr); } -static void host_s2_free_removed_table(void *addr, u32 level) +static void host_s2_free_unlinked_table(void *addr, s8 level) { - kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level); + kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level); } static int prepare_s2_pool(void *pgt_pool_base) @@ -110,7 +115,7 @@ static int prepare_s2_pool(void *pgt_pool_base) host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_pages_exact = host_s2_zalloc_pages_exact, .zalloc_page = host_s2_zalloc_page, - .free_removed_table = host_s2_free_removed_table, + .free_unlinked_table = host_s2_free_unlinked_table, .phys_to_virt = hyp_phys_to_virt, .virt_to_phys = hyp_virt_to_phys, .page_count = hyp_page_count, @@ -129,8 +134,8 @@ static void prepare_host_vtcr(void) parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val); phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); - host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, - id_aa64mmfr1_el1_sys_val, phys_shift); + host_mmu.arch.mmu.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, + id_aa64mmfr1_el1_sys_val, phys_shift); } static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot); @@ -161,12 +166,6 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) return 0; } -static bool guest_stage2_force_pte_cb(u64 addr, u64 end, - enum kvm_pgtable_prot prot) -{ - return true; -} - static void *guest_s2_zalloc_pages_exact(size_t size) { void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size)); @@ -201,8 +200,8 @@ static void *guest_s2_zalloc_page(void *mc) memset(addr, 0, PAGE_SIZE); p = hyp_virt_to_page(addr); - memset(p, 0, sizeof(*p)); p->refcount = 1; + p->order = 0; return addr; } @@ -217,16 +216,42 @@ static void guest_s2_put_page(void *addr) hyp_put_page(¤t_vm->pool, addr); } +static void __apply_guest_page(void *va, size_t size, + void (*func)(void *addr, size_t size)) +{ + size += va - PTR_ALIGN_DOWN(va, PAGE_SIZE); + va = PTR_ALIGN_DOWN(va, PAGE_SIZE); + size = PAGE_ALIGN(size); + + while (size) { + size_t map_size = PAGE_SIZE; + void *map; + + if (IS_ALIGNED((unsigned long)va, PMD_SIZE) && size >= PMD_SIZE) + map = hyp_fixblock_map(__hyp_pa(va), &map_size); + else + map = hyp_fixmap_map(__hyp_pa(va)); + + func(map, map_size); + + if (map_size == PMD_SIZE) + hyp_fixblock_unmap(); + else + hyp_fixmap_unmap(); + + size -= map_size; + va += map_size; + } +} + static void clean_dcache_guest_page(void *va, size_t size) { - __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); - hyp_fixmap_unmap(); + __apply_guest_page(va, size, __clean_dcache_guest_page); } static void invalidate_icache_guest_page(void *va, size_t size) { - __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); - hyp_fixmap_unmap(); + __apply_guest_page(va, size, __invalidate_icache_guest_page); } int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) @@ -235,7 +260,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) unsigned long nr_pages; int ret; - nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; + nr_pages = kvm_pgtable_stage2_pgd_size(mmu->vtcr) >> PAGE_SHIFT; ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0); if (ret) return ret; @@ -255,8 +280,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) }; guest_lock_component(vm); - ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, - guest_stage2_force_pte_cb); + ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, NULL); guest_unlock_component(vm); if (ret) return ret; @@ -266,8 +290,9 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) return 0; } -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { + struct hyp_page *page; void *addr; /* Dump all pgtable pages in the hyp_pool */ @@ -279,7 +304,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) /* Drain the hyp_pool into the memcache */ addr = hyp_alloc_pages(&vm->pool, 0); while (addr) { - memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page)); + page = hyp_virt_to_page(addr); + page->refcount = 0; + page->order = 0; push_hyp_memcache(mc, addr, hyp_virt_to_phys); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); addr = hyp_alloc_pages(&vm->pool, 0); @@ -295,11 +322,18 @@ int __pkvm_prot_finalize(void) return -EPERM; params->vttbr = kvm_get_vttbr(mmu); - params->vtcr = host_mmu.arch.vtcr; + params->vtcr = mmu->vtcr; params->hcr_el2 |= HCR_VM; + + /* + * The CMO below not only cleans the updated params to the + * PoC, but also provides the DSB that ensures ongoing + * page-table walks that have started before we trapped to EL2 + * have completed. + */ kvm_flush_dcache_to_poc(params, sizeof(*params)); - write_sysreg(params->hcr_el2, hcr_el2); + write_sysreg_hcr(params->hcr_el2); __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); /* @@ -333,6 +367,19 @@ static int host_stage2_unmap_dev_all(void) return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); } +/* + * Ensure the PFN range is contained within PA-range. + * + * This check is also robust to overflows and is therefore a requirement before + * using a pfn/nr_pages pair from an untrusted source. + */ +static bool pfn_range_is_valid(u64 pfn, u64 nr_pages) +{ + u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT); + + return pfn < limit && ((limit - pfn) >= nr_pages); +} + struct kvm_mem_range { u64 start; u64 end; @@ -375,19 +422,28 @@ bool addr_is_memory(phys_addr_t phys) return !!find_mem_range(phys, &range); } -static bool addr_is_allowed_memory(phys_addr_t phys) +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + +static int check_range_allowed_memory(u64 start, u64 end) { struct memblock_region *reg; struct kvm_mem_range range; - reg = find_mem_range(phys, &range); + /* + * Callers can't check the state of a range that overlaps memory and + * MMIO regions, so ensure [start, end[ is in the same kvm_mem_range. + */ + reg = find_mem_range(start, &range); + if (!is_in_mem_range(end - 1, &range)) + return -EINVAL; - return reg && !(reg->flags & MEMBLOCK_NOMAP); -} + if (!reg || reg->flags & MEMBLOCK_NOMAP) + return -EPERM; -static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) -{ - return range->start <= addr && addr < range->end; + return 0; } static bool range_is_memory(u64 start, u64 end) @@ -436,7 +492,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) { struct kvm_mem_range cur; kvm_pte_t pte; - u32 level; + u64 granule; + s8 level; int ret; hyp_assert_lock_held(&host_mmu.lock); @@ -447,21 +504,27 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) if (kvm_pte_valid(pte)) return -EAGAIN; - if (pte) + if (pte) { + WARN_ON(addr_is_memory(addr) && + get_host_state(hyp_phys_to_page(addr)) != PKVM_NOPAGE); return -EPERM; + } - do { - u64 granule = kvm_granule_size(level); + for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) { + if (!kvm_level_supports_block_mapping(level)) + continue; + granule = kvm_granule_size(level); cur.start = ALIGN_DOWN(addr, granule); cur.end = cur.start + granule; - level++; - } while ((level < KVM_PGTABLE_MAX_LEVELS) && - !(kvm_level_supports_block_mapping(level) && - range_included(&cur, range))); + if (!range_included(&cur, range)) + continue; + *range = cur; + return 0; + } - *range = cur; + WARN_ON(1); - return 0; + return -EINVAL; } int host_stage2_idmap_locked(phys_addr_t addr, u64 size, @@ -470,10 +533,31 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); } +static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state) +{ + for_each_hyp_page(page, addr, size) + set_host_state(page, state); +} + int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, - addr, size, &host_s2_pool, owner_id); + int ret; + + if (!range_is_memory(addr, addr + size)) + return -EPERM; + + ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, + addr, size, &host_s2_pool, owner_id); + if (ret) + return ret; + + /* Don't forget to update the vmemmap tracking for the host */ + if (owner_id == PKVM_ID_HOST) + __host_update_page_state(addr, size, PKVM_PAGE_OWNED); + else + __host_update_page_state(addr, size, PKVM_NOPAGE); + + return 0; } static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) @@ -526,49 +610,29 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) int ret = 0; esr = read_sysreg_el2(SYS_ESR); - BUG_ON(!__get_fault_info(esr, &fault)); + if (!__get_fault_info(esr, &fault)) { + /* + * We've presumably raced with a page-table change which caused + * AT to fail, try again. + */ + return; + } + + + /* + * Yikes, we couldn't resolve the fault IPA. This should reinject an + * abort into the host when we figure out how to do that. + */ + BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS)); + addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12; - addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; ret = host_stage2_idmap(addr); BUG_ON(ret && ret != -EAGAIN); } -struct pkvm_mem_transition { - u64 nr_pages; - - struct { - enum pkvm_component_id id; - /* Address in the initiator's address space */ - u64 addr; - - union { - struct { - /* Address in the completer's address space */ - u64 completer_addr; - } host; - struct { - u64 completer_addr; - } hyp; - }; - } initiator; - - struct { - enum pkvm_component_id id; - } completer; -}; - -struct pkvm_mem_share { - const struct pkvm_mem_transition tx; - const enum kvm_pgtable_prot completer_prot; -}; - -struct pkvm_mem_donation { - const struct pkvm_mem_transition tx; -}; - struct check_walk_data { enum pkvm_page_state desired; - enum pkvm_page_state (*get_page_state)(kvm_pte_t pte); + enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr); }; static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx, @@ -576,10 +640,7 @@ static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx, { struct check_walk_data *d = ctx->arg; - if (kvm_pte_valid(ctx->old) && !addr_is_allowed_memory(kvm_pte_to_phys(ctx->old))) - return -EINVAL; - - return d->get_page_state(ctx->old) == d->desired ? 0 : -EPERM; + return d->get_page_state(ctx->old, ctx->addr) == d->desired ? 0 : -EPERM; } static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, @@ -594,637 +655,742 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, return kvm_pgtable_walk(pgt, addr, size, &walker); } -static enum pkvm_page_state host_get_page_state(kvm_pte_t pte) -{ - if (!kvm_pte_valid(pte) && pte) - return PKVM_NOPAGE; - - return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); -} - static int __host_check_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - struct check_walk_data d = { - .desired = state, - .get_page_state = host_get_page_state, - }; + int ret; + + ret = check_range_allowed_memory(addr, addr + size); + if (ret) + return ret; hyp_assert_lock_held(&host_mmu.lock); - return check_page_state_range(&host_mmu.pgt, addr, size, &d); + + for_each_hyp_page(page, addr, size) { + if (get_host_state(page) != state) + return -EPERM; + } + + return 0; } static int __host_set_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state); + if (get_host_state(hyp_phys_to_page(addr)) == PKVM_NOPAGE) { + int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); - return host_stage2_idmap_locked(addr, size, prot); -} + if (ret) + return ret; + } -static int host_request_owned_transition(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; + __host_update_page_state(addr, size, state); - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); + return 0; } -static int host_request_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +static void __hyp_set_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) { - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); + for_each_hyp_page(page, phys, size) + set_hyp_state(page, state); } -static int host_initiate_share(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) { - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; + for_each_hyp_page(page, phys, size) { + if (get_hyp_state(page) != state) + return -EPERM; + } - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); + return 0; } -static int host_initiate_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) { - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); + return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); } -static int host_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr, + u64 size, enum pkvm_page_state state) { - u8 owner_id = tx->completer.id; - u64 size = tx->nr_pages * PAGE_SIZE; - - *completer_addr = tx->initiator.host.completer_addr; - return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id); -} + struct check_walk_data d = { + .desired = state, + .get_page_state = guest_get_page_state, + }; -static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) -{ - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HYP); + hyp_assert_lock_held(&vm->lock); + return check_page_state_range(&vm->pgt, addr, size, &d); } -static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, - enum pkvm_page_state state) +int __pkvm_host_share_hyp(u64 pfn) { - u64 size = tx->nr_pages * PAGE_SIZE; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE; + int ret; - if (__host_ack_skip_pgtable_check(tx)) - return 0; + host_lock_component(); + hyp_lock_component(); - return __host_check_page_state_range(addr, size, state); -} + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; -static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - return __host_ack_transition(addr, tx, PKVM_NOPAGE); -} + __hyp_set_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED)); -static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u8 host_id = tx->completer.id; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return host_stage2_set_owner_locked(addr, size, host_id); + return ret; } -static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) +int __pkvm_host_unshare_hyp(u64 pfn) { - if (!kvm_pte_valid(pte)) - return PKVM_NOPAGE; + u64 phys = hyp_pfn_to_phys(pfn); + u64 virt = (u64)__hyp_va(phys); + u64 size = PAGE_SIZE; + int ret; - return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); -} + host_lock_component(); + hyp_lock_component(); -static int __hyp_check_page_state_range(u64 addr, u64 size, - enum pkvm_page_state state) -{ - struct check_walk_data d = { - .desired = state, - .get_page_state = hyp_get_page_state, - }; + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + if (hyp_page_count((void *)virt)) { + ret = -EBUSY; + goto unlock; + } - hyp_assert_lock_held(&pkvm_pgd_lock); - return check_page_state_range(&pkvm_pgtable, addr, size, &d); -} + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED)); -static int hyp_request_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; +unlock: + hyp_unlock_component(); + host_unlock_component(); - *completer_addr = tx->initiator.hyp.completer_addr; - return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED); + return ret; } -static int hyp_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) { - u64 size = tx->nr_pages * PAGE_SIZE; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + void *virt = __hyp_va(phys); int ret; - *completer_addr = tx->initiator.hyp.completer_addr; - ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size); - return (ret != size) ? -EFAULT : 0; -} + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; -static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) -{ - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HOST); -} + host_lock_component(); + hyp_lock_component(); -static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) -{ - u64 size = tx->nr_pages * PAGE_SIZE; + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; - if (perms != PAGE_HYP) - return -EPERM; + __hyp_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, PAGE_HYP)); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP)); - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); + return ret; } -static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) { - u64 size = tx->nr_pages * PAGE_SIZE; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + u64 virt = (u64)__hyp_va(phys); + int ret; - if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) - return -EBUSY; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; + host_lock_component(); + hyp_lock_component(); - return __hyp_check_page_state_range(addr, size, - PKVM_PAGE_SHARED_BORROWED); -} + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; -static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST)); - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); + return ret; } -static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) +int hyp_pin_shared_mem(void *from, void *to) { - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); - enum kvm_pgtable_prot prot; + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + u64 phys = __hyp_pa(start); + u64 size = end - start; + struct hyp_page *p; + int ret; - prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED); - return pkvm_create_mappings_locked(start, end, prot); -} + host_lock_component(); + hyp_lock_component(); -static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; - return (ret != size) ? -EFAULT : 0; -} + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; -static int hyp_complete_donation(u64 addr, - const struct pkvm_mem_transition *tx) -{ - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); - enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + hyp_page_ref_inc(p); + if (p->refcount == 1) + WARN_ON(pkvm_create_mappings_locked((void *)cur, + (void *)cur + PAGE_SIZE, + PAGE_HYP)); + } - return pkvm_create_mappings_locked(start, end, prot); +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; } -static int check_share(struct pkvm_mem_share *share) +void hyp_unpin_shared_mem(void *from, void *to) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; - int ret; - - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + struct hyp_page *p; - if (ret) - return ret; + host_lock_component(); + hyp_lock_component(); - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_share(completer_addr, tx, share->completer_prot); - break; - default: - ret = -EINVAL; + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + if (p->refcount == 1) + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, cur, PAGE_SIZE) != PAGE_SIZE); + hyp_page_ref_dec(p); } - return ret; + hyp_unlock_component(); + host_unlock_component(); } -static int __do_share(struct pkvm_mem_share *share) +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_share(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - if (ret) - return ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_share(completer_addr, tx, share->completer_prot); - break; - default: - ret = -EINVAL; - } + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + host_unlock_component(); return ret; } -/* - * do_share(): - * - * The page owner grants access to another component with a given set - * of permissions. - * - * Initiator: OWNED => SHARED_OWNED - * Completer: NOPAGE => SHARED_BORROWED - */ -static int do_share(struct pkvm_mem_share *share) +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) { + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; int ret; - ret = check_share(share); - if (ret) - return ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; - return WARN_ON(__do_share(share)); + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + host_unlock_component(); + + return ret; } -static int check_unshare(struct pkvm_mem_share *share) +static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *size) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; - int ret; + size_t block_size; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; + if (nr_pages == 1) { + *size = PAGE_SIZE; + return 0; } - if (ret) - return ret; + /* We solely support second to last level huge mapping */ + block_size = kvm_granule_size(KVM_PGTABLE_LAST_LEVEL - 1); - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_unshare(completer_addr, tx); - break; - default: - ret = -EINVAL; - } + if (nr_pages != block_size >> PAGE_SHIFT) + return -EINVAL; - return ret; + if (!IS_ALIGNED(phys | ipa, block_size)) + return -EINVAL; + + *size = block_size; + return 0; } -static int __do_unshare(struct pkvm_mem_share *share) +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys = hyp_pfn_to_phys(pfn); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 size; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + + ret = __guest_check_transition_size(phys, ipa, nr_pages, &size); + if (ret) + return ret; + ret = check_range_allowed_memory(phys, phys + size); if (ret) return ret; - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_unshare(completer_addr, tx); - break; - default: - ret = -EINVAL; + host_lock_component(); + guest_lock_component(vm); + + ret = __guest_check_page_state_range(vm, ipa, size, PKVM_NOPAGE); + if (ret) + goto unlock; + + for_each_hyp_page(page, phys, size) { + switch (get_host_state(page)) { + case PKVM_PAGE_OWNED: + continue; + case PKVM_PAGE_SHARED_OWNED: + if (page->host_share_guest_count == U32_MAX) { + ret = -EBUSY; + goto unlock; + } + + /* Only host to np-guest multi-sharing is tolerated */ + if (page->host_share_guest_count) + continue; + + fallthrough; + default: + ret = -EPERM; + goto unlock; + } + } + + for_each_hyp_page(page, phys, size) { + set_host_state(page, PKVM_PAGE_SHARED_OWNED); + page->host_share_guest_count++; } + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, size, phys, + pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + return ret; } -/* - * do_unshare(): - * - * The page owner revokes access from another component for a range of - * pages which were previously shared using do_share(). - * - * Initiator: SHARED_OWNED => OWNED - * Completer: SHARED_BORROWED => NOPAGE - */ -static int do_unshare(struct pkvm_mem_share *share) +static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa, u64 size) { + enum pkvm_page_state state; + kvm_pte_t pte; + u64 phys; + s8 level; int ret; - ret = check_unshare(share); + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); if (ret) return ret; + if (!kvm_pte_valid(pte)) + return -ENOENT; + if (size && kvm_granule_size(level) != size) + return -E2BIG; - return WARN_ON(__do_unshare(share)); -} - -static int check_donation(struct pkvm_mem_donation *donation) -{ - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; - int ret; + if (!size) + size = kvm_granule_size(level); - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_request_donation(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + state = guest_get_page_state(pte, ipa); + if (state != PKVM_PAGE_SHARED_BORROWED) + return -EPERM; - if (ret) + phys = kvm_pte_to_phys(pte); + ret = check_range_allowed_memory(phys, phys + size); + if (WARN_ON(ret)) return ret; - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_ack_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_ack_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; + for_each_hyp_page(page, phys, size) { + if (get_host_state(page) != PKVM_PAGE_SHARED_OWNED) + return -EPERM; + if (WARN_ON(!page->host_share_guest_count)) + return -EINVAL; } - return ret; + *__phys = phys; + + return 0; } -static int __do_donate(struct pkvm_mem_donation *donation) +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) { - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; + u64 ipa = hyp_pfn_to_phys(gfn); + u64 size, phys; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_donation(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_initiate_donation(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); if (ret) return ret; - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_complete_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_complete_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - return ret; -} + host_lock_component(); + guest_lock_component(vm); -/* - * do_donate(): - * - * The page owner transfers ownership to another component, losing access - * as a consequence. - * - * Initiator: OWNED => NOPAGE - * Completer: NOPAGE => OWNED - */ -static int do_donate(struct pkvm_mem_donation *donation) -{ - int ret; + ret = __check_host_shared_guest(vm, &phys, ipa, size); + if (ret) + goto unlock; - ret = check_donation(donation); + ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, size); if (ret) - return ret; + goto unlock; + + for_each_hyp_page(page, phys, size) { + /* __check_host_shared_guest() protects against underflow */ + page->host_share_guest_count--; + if (!page->host_share_guest_count) + set_host_state(page, PKVM_PAGE_OWNED); + } + +unlock: + guest_unlock_component(vm); + host_unlock_component(); - return WARN_ON(__do_donate(donation)); + return ret; } -int __pkvm_host_share_hyp(u64 pfn) +static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa, u64 size) { + u64 phys; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; + + if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) + return; host_lock_component(); - hyp_lock_component(); + guest_lock_component(vm); - ret = do_share(&share); + ret = __check_host_shared_guest(vm, &phys, ipa, size); - hyp_unlock_component(); + guest_unlock_component(vm); host_unlock_component(); - return ret; + WARN_ON(ret && ret != -ENOENT); } -int __pkvm_host_unshare_hyp(u64 pfn) +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) { + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; - host_lock_component(); - hyp_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = do_unshare(&share); + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; - hyp_unlock_component(); - host_unlock_component(); + assert_host_shared_guest(vm, ipa, 0); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); + guest_unlock_component(vm); return ret; } -int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) { + u64 size, ipa = hyp_pfn_to_phys(gfn); int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - }; - host_lock_component(); - hyp_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = do_donate(&donation); + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; - hyp_unlock_component(); - host_unlock_component(); + assert_host_shared_guest(vm, ipa, size); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, size); + guest_unlock_component(vm); return ret; } -int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm) { + u64 size, ipa = hyp_pfn_to_phys(gfn); int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HYP, - .addr = hyp_addr, - .hyp = { - .completer_addr = host_addr, - }, - }, - .completer = { - .id = PKVM_ID_HOST, - }, - }, - }; - host_lock_component(); - hyp_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = do_donate(&donation); + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; - hyp_unlock_component(); - host_unlock_component(); + assert_host_shared_guest(vm, ipa, size); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, size, mkold); + guest_unlock_component(vm); return ret; } -int hyp_pin_shared_mem(void *from, void *to) +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) { - u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); - u64 end = PAGE_ALIGN((u64)to); - u64 size = end - start; - int ret; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); - host_lock_component(); - hyp_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = __host_check_page_state_range(__hyp_pa(start), size, - PKVM_PAGE_SHARED_OWNED); - if (ret) - goto unlock; + assert_host_shared_guest(vm, ipa, 0); + guest_lock_component(vm); + kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); + guest_unlock_component(vm); - ret = __hyp_check_page_state_range(start, size, - PKVM_PAGE_SHARED_BORROWED); - if (ret) - goto unlock; + return 0; +} - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_inc(hyp_virt_to_page(cur)); +#ifdef CONFIG_NVHE_EL2_DEBUG +struct pkvm_expected_state { + enum pkvm_page_state host; + enum pkvm_page_state hyp; + enum pkvm_page_state guest[2]; /* [ gfn, gfn + 1 ] */ +}; -unlock: - hyp_unlock_component(); - host_unlock_component(); +static struct pkvm_expected_state selftest_state; +static struct hyp_page *selftest_page; - return ret; +static struct pkvm_hyp_vm selftest_vm = { + .kvm = { + .arch = { + .mmu = { + .arch = &selftest_vm.kvm.arch, + .pgt = &selftest_vm.pgt, + }, + }, + }, +}; + +static struct pkvm_hyp_vcpu selftest_vcpu = { + .vcpu = { + .arch = { + .hw_mmu = &selftest_vm.kvm.arch.mmu, + }, + .kvm = &selftest_vm.kvm, + }, +}; + +static void init_selftest_vm(void *virt) +{ + struct hyp_page *p = hyp_virt_to_page(virt); + int i; + + selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + + for (i = 0; i < pkvm_selftest_pages(); i++) { + if (p[i].refcount) + continue; + p[i].refcount = 1; + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } } -void hyp_unpin_shared_mem(void *from, void *to) +static u64 selftest_ipa(void) { - u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); - u64 end = PAGE_ALIGN((u64)to); + return BIT(selftest_vm.pgt.ia_bits - 1); +} - host_lock_component(); - hyp_lock_component(); +static void assert_page_state(void) +{ + void *virt = hyp_page_to_virt(selftest_page); + u64 size = PAGE_SIZE << selftest_page->order; + struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; + u64 phys = hyp_virt_to_phys(virt); + u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE }; + struct pkvm_hyp_vm *vm; - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_dec(hyp_virt_to_page(cur)); + vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); - hyp_unlock_component(); + host_lock_component(); + WARN_ON(__host_check_page_state_range(phys, size, selftest_state.host)); host_unlock_component(); -} + + hyp_lock_component(); + WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp)); + hyp_unlock_component(); + + guest_lock_component(&selftest_vm); + WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0])); + WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1])); + guest_unlock_component(&selftest_vm); +} + +#define assert_transition_res(res, fn, ...) \ + do { \ + WARN_ON(fn(__VA_ARGS__) != res); \ + assert_page_state(); \ + } while (0) + +void pkvm_ownership_selftest(void *base) +{ + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX; + void *virt = hyp_alloc_pages(&host_s2_pool, 0); + struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; + struct pkvm_hyp_vm *vm = &selftest_vm; + u64 phys, size, pfn, gfn; + + WARN_ON(!virt); + selftest_page = hyp_virt_to_page(virt); + selftest_page->refcount = 0; + init_selftest_vm(base); + + size = PAGE_SIZE << selftest_page->order; + phys = hyp_virt_to_phys(virt); + pfn = hyp_phys_to_pfn(phys); + gfn = hyp_phys_to_pfn(selftest_ipa()); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = PKVM_PAGE_OWNED; + selftest_state.guest[0] = selftest_state.guest[1] = PKVM_NOPAGE; + assert_page_state(); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + hyp_unpin_shared_mem(virt, virt + size); + WARN_ON(hyp_page_count(virt) != 1); + assert_transition_res(-EBUSY, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + hyp_unpin_shared_mem(virt, virt + size); + assert_page_state(); + WARN_ON(hyp_page_count(virt)); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_hyp, pfn); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.guest[0] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + WARN_ON(hyp_virt_to_page(virt)->host_share_guest_count != 2); + + selftest_state.guest[0] = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn, 1, vm); + + selftest_state.guest[1] = PKVM_NOPAGE; + selftest_state.host = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1); + + selftest_page->refcount = 1; + hyp_put_page(&host_s2_pool, virt); +} +#endif diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 318298eb3d6b..ae8391baebc3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -44,6 +44,27 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size, return err; } +static int __pkvm_alloc_private_va_range(unsigned long start, size_t size) +{ + unsigned long cur; + + hyp_assert_lock_held(&pkvm_pgd_lock); + + if (!start || start < __io_map_base) + return -EINVAL; + + /* The allocated size is always a multiple of PAGE_SIZE */ + cur = start + PAGE_ALIGN(size); + + /* Are we overflowing on the vmemmap ? */ + if (cur > __hyp_vmemmap) + return -ENOMEM; + + __io_map_base = cur; + + return 0; +} + /** * pkvm_alloc_private_va_range - Allocates a private VA range. * @size: The size of the VA range to reserve. @@ -56,27 +77,16 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size, */ int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr) { - unsigned long base, addr; - int ret = 0; + unsigned long addr; + int ret; hyp_spin_lock(&pkvm_pgd_lock); - - /* Align the allocation based on the order of its size */ - addr = ALIGN(__io_map_base, PAGE_SIZE << get_order(size)); - - /* The allocated size is always a multiple of PAGE_SIZE */ - base = addr + PAGE_ALIGN(size); - - /* Are we overflowing on the vmemmap ? */ - if (!addr || base > __hyp_vmemmap) - ret = -ENOMEM; - else { - __io_map_base = base; - *haddr = addr; - } - + addr = __io_map_base; + ret = __pkvm_alloc_private_va_range(addr, size); hyp_spin_unlock(&pkvm_pgd_lock); + *haddr = addr; + return ret; } @@ -145,7 +155,7 @@ int hyp_back_vmemmap(phys_addr_t back) start = hyp_memory[i].base; start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE); /* - * The begining of the hyp_vmemmap region for the current + * The beginning of the hyp_vmemmap region for the current * memblock may already be backed by the page backing the end * the previous region, so avoid mapping it twice. */ @@ -219,9 +229,8 @@ int hyp_map_vectors(void) return 0; } -void *hyp_fixmap_map(phys_addr_t phys) +static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys) { - struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots); kvm_pte_t pte, *ptep = slot->ptep; pte = *ptep; @@ -233,10 +242,21 @@ void *hyp_fixmap_map(phys_addr_t phys) return (void *)slot->addr; } +void *hyp_fixmap_map(phys_addr_t phys) +{ + return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys); +} + static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) { kvm_pte_t *ptep = slot->ptep; u64 addr = slot->addr; + u32 level; + + if (FIELD_GET(KVM_PTE_TYPE, *ptep) == KVM_PTE_TYPE_PAGE) + level = KVM_PGTABLE_LAST_LEVEL; + else + level = KVM_PGTABLE_LAST_LEVEL - 1; /* create_fixblock() guarantees PMD level */ WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID); @@ -250,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 */ dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1)); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); dsb(ish); isb(); } @@ -263,9 +283,9 @@ void hyp_fixmap_unmap(void) static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg); + struct hyp_fixmap_slot *slot = (struct hyp_fixmap_slot *)ctx->arg; - if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 1) + if (!kvm_pte_valid(ctx->old) || (ctx->end - ctx->start) != kvm_granule_size(ctx->level)) return -EINVAL; slot->addr = ctx->addr; @@ -286,13 +306,84 @@ static int create_fixmap_slot(u64 addr, u64 cpu) struct kvm_pgtable_walker walker = { .cb = __create_fixmap_slot_cb, .flags = KVM_PGTABLE_WALK_LEAF, - .arg = (void *)cpu, + .arg = per_cpu_ptr(&fixmap_slots, cpu), }; return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker); } -int hyp_create_pcpu_fixmap(void) +#if PAGE_SHIFT < 16 +#define HAS_FIXBLOCK +static struct hyp_fixmap_slot hyp_fixblock_slot; +static DEFINE_HYP_SPINLOCK(hyp_fixblock_lock); +#endif + +static int create_fixblock(void) +{ +#ifdef HAS_FIXBLOCK + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &hyp_fixblock_slot, + }; + unsigned long addr; + phys_addr_t phys; + int ret, i; + + /* Find a RAM phys address, PMD aligned */ + for (i = 0; i < hyp_memblock_nr; i++) { + phys = ALIGN(hyp_memory[i].base, PMD_SIZE); + if (phys + PMD_SIZE < (hyp_memory[i].base + hyp_memory[i].size)) + break; + } + + if (i >= hyp_memblock_nr) + return -EINVAL; + + hyp_spin_lock(&pkvm_pgd_lock); + addr = ALIGN(__io_map_base, PMD_SIZE); + ret = __pkvm_alloc_private_va_range(addr, PMD_SIZE); + if (ret) + goto unlock; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PMD_SIZE, phys, PAGE_HYP); + if (ret) + goto unlock; + + ret = kvm_pgtable_walk(&pkvm_pgtable, addr, PMD_SIZE, &walker); + +unlock: + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +#else + return 0; +#endif +} + +void *hyp_fixblock_map(phys_addr_t phys, size_t *size) +{ +#ifdef HAS_FIXBLOCK + *size = PMD_SIZE; + hyp_spin_lock(&hyp_fixblock_lock); + return fixmap_map_slot(&hyp_fixblock_slot, phys); +#else + *size = PAGE_SIZE; + return hyp_fixmap_map(phys); +#endif +} + +void hyp_fixblock_unmap(void) +{ +#ifdef HAS_FIXBLOCK + fixmap_clear_slot(&hyp_fixblock_slot); + hyp_spin_unlock(&hyp_fixblock_lock); +#else + hyp_fixmap_unmap(); +#endif +} + +int hyp_create_fixmap(void) { unsigned long addr, i; int ret; @@ -312,7 +403,7 @@ int hyp_create_pcpu_fixmap(void) return ret; } - return 0; + return create_fixblock(); } int hyp_create_idmap(u32 hyp_va_bits) @@ -340,6 +431,45 @@ int hyp_create_idmap(u32 hyp_va_bits) return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); } +int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) +{ + unsigned long addr, prev_base; + size_t size; + int ret; + + hyp_spin_lock(&pkvm_pgd_lock); + + prev_base = __io_map_base; + /* + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies + * an alignment of our allocation on the order of the size. + */ + size = NVHE_STACK_SIZE * 2; + addr = ALIGN(__io_map_base, size); + + ret = __pkvm_alloc_private_va_range(addr, size); + if (!ret) { + /* + * Since the stack grows downwards, map the stack to the page + * at the higher address and leave the lower guard page + * unbacked. + * + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 + * and addresses corresponding to the guard page have the + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. + */ + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + NVHE_STACK_SIZE, + NVHE_STACK_SIZE, phys, PAGE_HYP); + if (ret) + __io_map_base = prev_base; + } + hyp_spin_unlock(&pkvm_pgd_lock); + + *haddr = addr + size; + + return ret; +} + static void *admit_host_page(void *arg) { struct kvm_hyp_memcache *host_mc = arg; @@ -359,7 +489,7 @@ static void *admit_host_page(void *arg) return pop_hyp_memcache(host_mc, hyp_phys_to_virt); } -/* Refill our local memcache by poping pages from the one provided by the host. */ +/* Refill our local memcache by popping pages from the one provided by the host. */ int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc) { diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 803ba3222e75..a1eb27a1a747 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -32,7 +32,7 @@ u64 __hyp_vmemmap; */ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { phys_addr_t addr = hyp_page_to_phys(p); @@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, /* Find a buddy page currently available for allocation */ static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); @@ -94,7 +94,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { phys_addr_t phys = hyp_page_to_phys(p); - unsigned short order = p->order; + u8 order = p->order; struct hyp_page *buddy; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); @@ -110,7 +110,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, * after coalescing, so make sure to mark it HYP_NO_ORDER proactively. */ p->order = HYP_NO_ORDER; - for (; (order + 1) < pool->max_order; order++) { + for (; (order + 1) <= pool->max_order; order++) { buddy = __find_buddy_avail(pool, p, order); if (!buddy) break; @@ -129,7 +129,7 @@ insert: static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy; @@ -183,7 +183,7 @@ void hyp_get_page(struct hyp_pool *pool, void *addr) void hyp_split_page(struct hyp_page *p) { - unsigned short order = p->order; + u8 order = p->order; unsigned int i; p->order = 0; @@ -195,17 +195,17 @@ void hyp_split_page(struct hyp_page *p) } } -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order) { - unsigned short i = order; struct hyp_page *p; + u8 i = order; hyp_spin_lock(&pool->lock); /* Look for a high-enough-order page */ - while (i < pool->max_order && list_empty(&pool->free_area[i])) + while (i <= pool->max_order && list_empty(&pool->free_area[i])) i++; - if (i >= pool->max_order) { + if (i > pool->max_order) { hyp_spin_unlock(&pool->lock); return NULL; } @@ -228,8 +228,9 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, int i; hyp_spin_lock_init(&pool->lock); - pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT)); - for (i = 0; i < pool->max_order; i++) + pool->max_order = min(MAX_PAGE_ORDER, + get_order(nr_pages << PAGE_SHIFT)); + for (i = 0; i <= pool->max_order; i++) INIT_LIST_HEAD(&pool->free_area[i]); pool->range_start = phys; pool->range_end = phys + (nr_pages << PAGE_SHIFT); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index a06ece14a6d8..8911338961c5 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -6,191 +6,185 @@ #include <linux/kvm_host.h> #include <linux/mm.h> -#include <nvhe/fixed_config.h> + +#include <asm/kvm_emulate.h> + #include <nvhe/mem_protect.h> #include <nvhe/memory.h> #include <nvhe/pkvm.h> #include <nvhe/trap_handler.h> -/* Used by icache_is_vpipt(). */ +/* Used by icache_is_aliasing(). */ unsigned long __icache_flags; /* Used by kvm_get_vttbr(). */ unsigned int kvm_arm_vmid_bits; +unsigned int kvm_host_sve_max_vl; + /* - * Set trap register values based on features in ID_AA64PFR0. + * The currently loaded hyp vCPU for each physical CPU. Used in protected mode + * for both protected and non-protected VMs. */ -static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) +static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); + +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - u64 hcr_set = HCR_RW; - u64 hcr_clear = 0; - u64 cptr_set = 0; + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; - /* Protected KVM does not support AArch32 guests. */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY); - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_ELx_64BIT_ONLY); + if (has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; - /* - * Linux guests assume support for floating-point and Advanced SIMD. Do - * not change the trapping behavior for these from the KVM default. - */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), - PVM_ID_AA64PFR0_ALLOW)); - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), - PVM_ID_AA64PFR0_ALLOW)); - - /* Trap RAS unless all current versions are supported */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < - ID_AA64PFR0_EL1_RAS_V1P1) { - hcr_set |= HCR_TERR | HCR_TEA; - hcr_clear |= HCR_FIEN; + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; } - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) { - hcr_clear |= HCR_AMVOFFEN; - cptr_set |= CPTR_EL2_TAM; - } + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; - /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) - cptr_set |= CPTR_EL2_TZ; + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; } -/* - * Set trap register values based on features in ID_AA64PFR1. - */ -static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) +static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - u64 hcr_set = 0; - u64 hcr_clear = 0; + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.hcr_el2; - /* Memory Tagging: Trap and Treat as Untagged if not supported. */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) { - hcr_set |= HCR_TID5; - hcr_clear |= HCR_DCT | HCR_ATA; - } + /* No support for AArch32. */ + val |= HCR_RW; - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; -} + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; -/* - * Set trap register values based on features in ID_AA64DFR0. - */ -static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); - u64 mdcr_set = 0; - u64 mdcr_clear = 0; - u64 cptr_set = 0; - - /* Trap/constrain PMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; - mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | - MDCR_EL2_HPMN_MASK; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { + val |= HCR_TERR | HCR_TEA; + val &= ~(HCR_FIEN); } - /* Trap Debug */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids)) - mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; - - /* Trap OS Double Lock */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids)) - mdcr_set |= MDCR_EL2_TDOSA; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) + val &= ~(HCR_AMVOFFEN); - /* Trap SPE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPMS; - mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) { + val |= HCR_TID5; + val &= ~(HCR_DCT | HCR_ATA); } - /* Trap Trace Filter */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids)) - mdcr_set |= MDCR_EL2_TTRF; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) + val |= HCR_TLOR; - /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) - cptr_set |= CPTR_EL2_TTA; - - vcpu->arch.mdcr_el2 |= mdcr_set; - vcpu->arch.mdcr_el2 &= ~mdcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; + vcpu->arch.hcr_el2 = val; } -/* - * Set trap register values based on features in ID_AA64MMFR0. - */ -static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) +static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); - u64 mdcr_set = 0; + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.mdcr_el2; - /* Trap Debug Communications Channel registers */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids)) - mdcr_set |= MDCR_EL2_TDCC; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) { + val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); + } - vcpu->arch.mdcr_el2 |= mdcr_set; -} + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP)) + val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; -/* - * Set trap register values based on features in ID_AA64MMFR1. - */ -static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); - u64 hcr_set = 0; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) + val |= MDCR_EL2_TDOSA; + + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) { + val |= MDCR_EL2_TPMS; + val &= ~MDCR_EL2_E2PB_MASK; + } + + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + val |= MDCR_EL2_TTRF; - /* Trap LOR */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids)) - hcr_set |= HCR_TLOR; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP)) + val |= MDCR_EL2_E2TB_MASK; + + /* Trap Debug Communications Channel registers */ + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + val |= MDCR_EL2_TDCC; - vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.mdcr_el2 = val; } /* - * Set baseline trap register values. + * Check that cpu features that are neither trapped nor supported are not + * enabled for protected VMs. */ -static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) +static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { - const u64 hcr_trap_feat_regs = HCR_TID3; - const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; + struct kvm *kvm = vcpu->kvm; + + /* No AArch32 support for protected guests. */ + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) + return -EINVAL; /* - * Always trap: - * - Feature id registers: to control features exposed to guests - * - Implementation-defined features + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. */ - vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) || + !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP)) + return -EINVAL; - /* Clear res0 and set res1 bits to trap potential new features. */ - vcpu->arch.hcr_el2 &= ~(HCR_RES0); - vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); + /* No SME support in KVM right now. Check to catch if it changes. */ + if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) + return -EINVAL; + + return 0; } /* - * Initialize trap register values for protected VMs. + * Initialize trap register values in protected mode. */ -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) { - pvm_init_trap_regs(vcpu); - pvm_init_traps_aa64pfr0(vcpu); - pvm_init_traps_aa64pfr1(vcpu); - pvm_init_traps_aa64dfr0(vcpu); - pvm_init_traps_aa64mmfr0(vcpu); - pvm_init_traps_aa64mmfr1(vcpu); + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + int ret; + + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); + + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + /* Trust the host for non-protected vcpu features. */ + vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2; + memcpy(vcpu->arch.fgt, host_vcpu->arch.fgt, sizeof(vcpu->arch.fgt)); + return 0; + } + + ret = pkvm_check_pvm_cpu_features(vcpu); + if (ret) + return ret; + + pvm_init_traps_hcr(vcpu); + pvm_init_traps_mdcr(vcpu); + vcpu_set_hcrx(vcpu); + + return 0; } /* @@ -199,6 +193,11 @@ void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) */ #define HANDLE_OFFSET 0x1000 +/* + * Marks a reserved but not yet used entry in the VM table. + */ +#define RESERVED_ENTRY ((void *)0xa110ca7ed) + static unsigned int vm_handle_to_idx(pkvm_handle_t handle) { return handle - HANDLE_OFFSET; @@ -211,14 +210,14 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx) /* * Spinlock for protecting state related to the VM table. Protects writes - * to 'vm_table' and 'nr_table_entries' as well as reads and writes to - * 'last_hyp_vcpu_lookup'. + * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization. + * Also protects reads and writes to 'last_hyp_vcpu_lookup'. */ -static DEFINE_HYP_SPINLOCK(vm_table_lock); +DEFINE_HYP_SPINLOCK(vm_table_lock); /* - * The table of VM entries for protected VMs in hyp. - * Allocated at hyp initialization and setup. + * A table that tracks all VMs in protected mode. + * Allocated during hyp initialization and setup. */ static struct pkvm_hyp_vm **vm_table; @@ -238,6 +237,10 @@ static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) if (unlikely(idx >= KVM_MAX_PVMS)) return NULL; + /* A reserved entry doesn't represent an initialized VM. */ + if (unlikely(vm_table[idx] == RESERVED_ENTRY)) + return NULL; + return vm_table[idx]; } @@ -247,15 +250,32 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, struct pkvm_hyp_vcpu *hyp_vcpu = NULL; struct pkvm_hyp_vm *hyp_vm; + /* Cannot load a new vcpu without putting the old one first. */ + if (__this_cpu_read(loaded_hyp_vcpu)) + return NULL; + hyp_spin_lock(&vm_table_lock); hyp_vm = get_vm_by_handle(handle); - if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) + if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx) goto unlock; hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + if (!hyp_vcpu) + goto unlock; + + /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */ + if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { + hyp_vcpu = NULL; + goto unlock; + } + + hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu); hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); unlock: hyp_spin_unlock(&vm_table_lock); + + if (hyp_vcpu) + __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu); return hyp_vcpu; } @@ -264,63 +284,228 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); hyp_spin_lock(&vm_table_lock); + hyp_vcpu->loaded_hyp_vcpu = NULL; + __this_cpu_write(loaded_hyp_vcpu, NULL); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) +{ + return __this_cpu_read(loaded_hyp_vcpu); + +} + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (hyp_vm) + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm; +} + +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm) +{ + hyp_spin_lock(&vm_table_lock); hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); hyp_spin_unlock(&vm_table_lock); } +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); + + if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) { + put_pkvm_hyp_vm(hyp_vm); + hyp_vm = NULL; + } + + return hyp_vm; +} + +static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) +{ + struct kvm *kvm = &hyp_vm->kvm; + unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); + DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* CTR_EL0 is always under host control, even for protected VMs. */ + hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; + + /* Preserve the vgic model so that GICv3 emulation works */ + hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model; + + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) + set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); + + /* No restrictions for non-protected VMs. */ + if (!kvm_vm_is_protected(kvm)) { + hyp_vm->kvm.arch.flags = host_arch_flags; + + bitmap_copy(kvm->arch.vcpu_features, + host_kvm->arch.vcpu_features, + KVM_VCPU_MAX_FEATURES); + + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags)) + hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1; + + return; + } + + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); + + set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) + set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) + set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) + set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) { + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE); + } + + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, + allowed_features, KVM_VCPU_MAX_FEATURES); +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); } +static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + void *sve_state; + + if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE)) + return; + + sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + hyp_unpin_shared_mem(sve_state, + sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); +} + static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], unsigned int nr_vcpus) { int i; - for (i = 0; i < nr_vcpus; i++) - unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); + for (i = 0; i < nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i]; + + if (!hyp_vcpu) + continue; + + unpin_host_vcpu(hyp_vcpu->host_vcpu); + unpin_host_sve_state(hyp_vcpu); + } } static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, - unsigned int nr_vcpus) + unsigned int nr_vcpus, pkvm_handle_t handle) { + struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; + int idx = vm_handle_to_idx(handle); + + hyp_vm->kvm.arch.pkvm.handle = handle; + hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; - hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; + hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected); + hyp_vm->kvm.arch.pkvm.is_created = true; + hyp_vm->kvm.arch.flags = 0; + pkvm_init_features_from_host(hyp_vm, host_kvm); + + /* VMID 0 is reserved for the host */ + atomic64_set(&mmu->vmid.id, idx + 1); + + mmu->vtcr = host_mmu.arch.mmu.vtcr; + mmu->arch = &hyp_vm->kvm.arch; + mmu->pgt = &hyp_vm->pgt; +} + +static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + unsigned int sve_max_vl; + size_t sve_state_size; + void *sve_state; + int ret = 0; + + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { + vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); + return 0; + } + + /* Limit guest vector length to the maximum supported by the host. */ + sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl); + sve_state_size = sve_state_size_from_vl(sve_max_vl); + sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state)); + + if (!sve_state || !sve_state_size) { + ret = -EINVAL; + goto err; + } + + ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size); + if (ret) + goto err; + + vcpu->arch.sve_state = sve_state; + vcpu->arch.sve_max_vl = sve_max_vl; + + return 0; +err: + clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features); + return ret; } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, struct pkvm_hyp_vm *hyp_vm, - struct kvm_vcpu *host_vcpu, - unsigned int vcpu_idx) + struct kvm_vcpu *host_vcpu) { int ret = 0; if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) return -EBUSY; - if (host_vcpu->vcpu_idx != vcpu_idx) { - ret = -EINVAL; - goto done; - } - hyp_vcpu->host_vcpu = host_vcpu; hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); - hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; + hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx); hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + + ret = pkvm_vcpu_init_traps(hyp_vcpu); + if (ret) + goto done; + + ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); return ret; } -static int find_free_vm_table_entry(struct kvm *host_kvm) +static int find_free_vm_table_entry(void) { int i; @@ -333,15 +518,13 @@ static int find_free_vm_table_entry(struct kvm *host_kvm) } /* - * Allocate a VM table entry and insert a pointer to the new vm. + * Reserve a VM table entry. * - * Return a unique handle to the protected VM on success, + * Return a unique handle to the VM on success, * negative error code on failure. */ -static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, - struct pkvm_hyp_vm *hyp_vm) +static int allocate_vm_table_entry(void) { - struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; int idx; hyp_assert_lock_held(&vm_table_lock); @@ -354,20 +537,57 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, if (unlikely(!vm_table)) return -EINVAL; - idx = find_free_vm_table_entry(host_kvm); - if (idx < 0) + idx = find_free_vm_table_entry(); + if (unlikely(idx < 0)) return idx; - hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx); + vm_table[idx] = RESERVED_ENTRY; - /* VMID 0 is reserved for the host */ - atomic64_set(&mmu->vmid.id, idx + 1); + return idx; +} - mmu->arch = &hyp_vm->kvm.arch; - mmu->pgt = &hyp_vm->pgt; +static int __insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + unsigned int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. + */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = vm_handle_to_idx(handle); + if (unlikely(idx >= KVM_MAX_PVMS)) + return -EINVAL; + + if (unlikely(vm_table[idx] != RESERVED_ENTRY)) + return -EINVAL; vm_table[idx] = hyp_vm; - return hyp_vm->kvm.arch.pkvm.handle; + + return 0; +} + +/* + * Insert a pointer to the initialized VM into the VM table. + * + * Return 0 on success, or negative error code on failure. + */ +static int insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + int ret; + + hyp_spin_lock(&vm_table_lock); + ret = __insert_vm_table_entry(handle, hyp_vm); + hyp_spin_unlock(&vm_table_lock); + + return ret; } /* @@ -411,6 +631,7 @@ static void *map_donated_memory(unsigned long host_va, size_t size) static void __unmap_donated_memory(void *va, size_t size) { + kvm_flush_dcache_to_poc(va, size); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va), PAGE_ALIGN(size) >> PAGE_SHIFT)); } @@ -433,10 +654,45 @@ static void unmap_donated_memory_noclear(void *va, size_t size) } /* - * Initialize the hypervisor copy of the protected VM state using the - * memory donated by the host. + * Reserves an entry in the hypervisor for a new VM in protected mode. * - * Unmaps the donated memory from the host at stage 2. + * Return a unique handle to the VM on success, negative error code on failure. + */ +int __pkvm_reserve_vm(void) +{ + int ret; + + hyp_spin_lock(&vm_table_lock); + ret = allocate_vm_table_entry(); + hyp_spin_unlock(&vm_table_lock); + + if (ret < 0) + return ret; + + return idx_to_vm_handle(ret); +} + +/* + * Removes a reserved entry, but only if is hasn't been used yet. + * Otherwise, the VM needs to be destroyed. + */ +void __pkvm_unreserve_vm(pkvm_handle_t handle) +{ + unsigned int idx = vm_handle_to_idx(handle); + + if (unlikely(!vm_table)) + return; + + hyp_spin_lock(&vm_table_lock); + if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY)) + remove_vm_table_entry(handle); + hyp_spin_unlock(&vm_table_lock); +} + +/* + * Initialize the hypervisor copy of the VM state using host-donated memory. + * + * Unmap the donated memory from the host at stage 2. * * host_kvm: A pointer to the host's struct kvm. * vm_hva: The host va of the area being donated for the VM state. @@ -445,8 +701,7 @@ static void unmap_donated_memory_noclear(void *va, size_t size) * the VM. Must be page aligned. Its size is implied by the VM's * VTCR. * - * Return a unique handle to the protected VM on success, - * negative error code on failure. + * Return 0 success, negative error code on failure. */ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva) @@ -454,6 +709,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, struct pkvm_hyp_vm *hyp_vm = NULL; size_t vm_size, pgd_size; unsigned int nr_vcpus; + pkvm_handle_t handle; void *pgd = NULL; int ret; @@ -467,8 +723,14 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, goto err_unpin_kvm; } + handle = READ_ONCE(host_kvm->arch.pkvm.handle); + if (unlikely(handle < HANDLE_OFFSET)) { + ret = -EINVAL; + goto err_unpin_kvm; + } + vm_size = pkvm_get_hyp_vm_size(nr_vcpus); - pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr); + pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr); ret = -ENOMEM; @@ -480,24 +742,19 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, if (!pgd) goto err_remove_mappings; - init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus); - - hyp_spin_lock(&vm_table_lock); - ret = insert_vm_table_entry(host_kvm, hyp_vm); - if (ret < 0) - goto err_unlock; + init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle); ret = kvm_guest_prepare_stage2(hyp_vm, pgd); if (ret) - goto err_remove_vm_table_entry; - hyp_spin_unlock(&vm_table_lock); + goto err_remove_mappings; - return hyp_vm->kvm.arch.pkvm.handle; + /* Must be called last since this publishes the VM. */ + ret = insert_vm_table_entry(handle, hyp_vm); + if (ret) + goto err_remove_mappings; + + return 0; -err_remove_vm_table_entry: - remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle); -err_unlock: - hyp_spin_unlock(&vm_table_lock); err_remove_mappings: unmap_donated_memory(hyp_vm, vm_size); unmap_donated_memory(pgd, pgd_size); @@ -507,10 +764,9 @@ err_unpin_kvm: } /* - * Initialize the hypervisor copy of the protected vCPU state using the - * memory donated by the host. + * Initialize the hypervisor copy of the vCPU state using host-donated memory. * - * handle: The handle for the protected vm. + * handle: The hypervisor handle for the vm. * host_vcpu: A pointer to the corresponding host vcpu. * vcpu_hva: The host va of the area being donated for the vcpu state. * Must be page aligned. The size of the area must be equal to @@ -537,24 +793,27 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, goto unlock; } - idx = hyp_vm->nr_vcpus; + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu); + if (ret) + goto unlock; + + idx = hyp_vcpu->vcpu.vcpu_idx; if (idx >= hyp_vm->kvm.created_vcpus) { ret = -EINVAL; goto unlock; } - ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx); - if (ret) + if (hyp_vm->vcpus[idx]) { + ret = -EINVAL; goto unlock; + } hyp_vm->vcpus[idx] = hyp_vcpu; - hyp_vm->nr_vcpus++; unlock: hyp_spin_unlock(&vm_table_lock); if (ret) unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); - return ret; } @@ -572,7 +831,7 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) int __pkvm_teardown_vm(pkvm_handle_t handle) { - struct kvm_hyp_memcache *mc; + struct kvm_hyp_memcache *mc, *stage2_mc; struct pkvm_hyp_vm *hyp_vm; struct kvm *host_kvm; unsigned int idx; @@ -600,12 +859,26 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) /* Reclaim guest pages (including page-table pages) */ mc = &host_kvm->arch.pkvm.teardown_mc; - reclaim_guest_pages(hyp_vm, mc); - unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); + stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc; + reclaim_pgtable_pages(hyp_vm, stage2_mc); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus); /* Push the metadata pages to the teardown memcache */ - for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { + for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + struct kvm_hyp_memcache *vcpu_mc; + + if (!hyp_vcpu) + continue; + + vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + + while (vcpu_mc->nr_pages) { + void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); + + push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys); + unmap_donated_memory_noclear(addr, PAGE_SIZE); + } teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 08508783ec3d..c3e196fb8b18 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -200,12 +200,12 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) __hyp_pa(init_params), 0); } -asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) +asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) { struct psci_boot_args *boot_args; struct kvm_cpu_context *host_ctxt; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); if (is_cpu_on) boot_args = this_cpu_ptr(&cpu_on_args); @@ -218,6 +218,9 @@ asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) if (is_cpu_on) release_boot_args(boot_args); + write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR); + write_sysreg(INIT_PSTATE_EL1, SPSR_EL2); + __host_enter(host_ctxt); } @@ -265,6 +268,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ case PSCI_1_0_FN_PSCI_FEATURES: case PSCI_1_0_FN_SET_SUSPEND_MODE: case PSCI_1_1_FN64_SYSTEM_RESET2: + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: return psci_forward(host_ctxt); case PSCI_1_0_FN64_SYSTEM_SUSPEND: return psci_system_suspend(func_id, host_ctxt); @@ -273,9 +278,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ } } -bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt) +bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) { - DECLARE_REG(u64, func_id, host_ctxt, 0); unsigned long ret; switch (kvm_host_psci_config.version) { diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 110f04627785..90bd014e952f 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -11,7 +11,7 @@ #include <asm/kvm_pkvm.h> #include <nvhe/early_alloc.h> -#include <nvhe/fixed_config.h> +#include <nvhe/ffa.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> #include <nvhe/mem_protect.h> @@ -28,6 +28,8 @@ static void *vmemmap_base; static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; +static void *selftest_base; +static void *ffa_proxy_pages; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static struct hyp_pool hpool; @@ -37,6 +39,11 @@ static int divide_memory_pool(void *virt, unsigned long size) hyp_early_alloc_init(virt, size); + nr_pages = pkvm_selftest_pages(); + selftest_base = hyp_early_alloc_contig(nr_pages); + if (nr_pages && !selftest_base) + return -ENOMEM; + nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page)); vmemmap_base = hyp_early_alloc_contig(nr_pages); if (!vmemmap_base) @@ -57,6 +64,33 @@ static int divide_memory_pool(void *virt, unsigned long size) if (!host_s2_pgt_base) return -ENOMEM; + nr_pages = hyp_ffa_proxy_pages(); + ffa_proxy_pages = hyp_early_alloc_contig(nr_pages); + if (!ffa_proxy_pages) + return -ENOMEM; + + return 0; +} + +static int pkvm_create_host_sve_mappings(void) +{ + void *start, *end; + int ret, i; + + if (!system_supports_sve()) + return 0; + + for (i = 0; i < hyp_nr_cpus; i++) { + struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i); + struct cpu_sve_state *sve_state = host_data->sve_state; + + start = kern_hyp_va(sve_state); + end = start + PAGE_ALIGN(pkvm_host_sve_state_size()); + ret = pkvm_create_mappings(start, end, PAGE_HYP); + if (ret) + return ret; + } + return 0; } @@ -66,7 +100,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, { void *start, *end, *virt = hyp_phys_to_virt(phys); unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; - enum kvm_pgtable_prot prot; int ret, i; /* Recreate the hyp page-table using the early page allocator */ @@ -92,6 +125,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; + ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP); + if (ret) + return ret; + ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; @@ -106,7 +143,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, for (i = 0; i < hyp_nr_cpus; i++) { struct kvm_nvhe_init_params *params = per_cpu_ptr(&kvm_init_params, i); - unsigned long hyp_addr; start = (void *)kern_hyp_va(per_cpu_base[i]); end = start + PAGE_ALIGN(hyp_percpu_size); @@ -114,51 +150,12 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - /* - * Allocate a contiguous HYP private VA range for the stack - * and guard page. The allocation is also aligned based on - * the order of its size. - */ - ret = pkvm_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr); - if (ret) - return ret; - - /* - * Since the stack grows downwards, map the stack to the page - * at the higher address and leave the lower guard page - * unbacked. - * - * Any valid stack address now has the PAGE_SHIFT bit as 1 - * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. - */ - hyp_spin_lock(&pkvm_pgd_lock); - ret = kvm_pgtable_hyp_map(&pkvm_pgtable, hyp_addr + PAGE_SIZE, - PAGE_SIZE, params->stack_pa, PAGE_HYP); - hyp_spin_unlock(&pkvm_pgd_lock); + ret = pkvm_create_stack(params->stack_pa, ¶ms->stack_hyp_va); if (ret) return ret; - - /* Update stack_hyp_va to end of the stack's private VA range */ - params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE); } - /* - * Map the host sections RO in the hypervisor, but transfer the - * ownership from the host to the hypervisor itself to make sure they - * can't be donated or shared with another entity. - * - * The ownership transition requires matching changes in the host - * stage-2. This will be done later (see finalize_host_mappings()) once - * the hyp_vmemmap is addressable. - */ - prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); - ret = pkvm_create_mappings(&kvm_vgic_global_state, - &kvm_vgic_global_state + 1, prot); - if (ret) - return ret; - - return 0; + return pkvm_create_host_sve_mappings(); } static void update_nvhe_init_params(void) @@ -192,39 +189,53 @@ static void hpool_put_page(void *addr) static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - enum kvm_pgtable_prot prot; enum pkvm_page_state state; + struct hyp_page *page; phys_addr_t phys; + enum kvm_pgtable_prot prot; if (!kvm_pte_valid(ctx->old)) return 0; - if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1)) + if (ctx->level != KVM_PGTABLE_LAST_LEVEL) return -EINVAL; phys = kvm_pte_to_phys(ctx->old); if (!addr_is_memory(phys)) return -EINVAL; + page = hyp_phys_to_page(phys); + /* * Adjust the host stage-2 mappings to match the ownership attributes - * configured in the hypervisor stage-1. + * configured in the hypervisor stage-1, and make sure to propagate them + * to the hyp_vmemmap state. */ - state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old)); + prot = kvm_pgtable_hyp_pte_prot(ctx->old); + state = pkvm_getstate(prot); switch (state) { case PKVM_PAGE_OWNED: - return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); + set_hyp_state(page, PKVM_PAGE_OWNED); + /* hyp text is RO in the host stage-2 to be inspected on panic. */ + if (prot == PAGE_HYP_EXEC) { + set_host_state(page, PKVM_NOPAGE); + return host_stage2_idmap_locked(phys, PAGE_SIZE, KVM_PGTABLE_PROT_R); + } else { + return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); + } case PKVM_PAGE_SHARED_OWNED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); + set_hyp_state(page, PKVM_PAGE_SHARED_OWNED); + set_host_state(page, PKVM_PAGE_SHARED_BORROWED); break; case PKVM_PAGE_SHARED_BORROWED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + set_hyp_state(page, PKVM_PAGE_SHARED_BORROWED); + set_host_state(page, PKVM_PAGE_SHARED_OWNED); break; default: return -EINVAL; } - return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); + return 0; } static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -275,8 +286,7 @@ static int fix_hyp_pgtable_refcnt(void) void __noreturn __pkvm_init_finalise(void) { - struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); - struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt; + struct kvm_cpu_context *host_ctxt = host_data_ptr(host_ctxt); unsigned long nr_pages, reserved_pages, pfn; int ret; @@ -310,11 +320,17 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; - ret = hyp_create_pcpu_fixmap(); + ret = hyp_create_fixmap(); + if (ret) + goto out; + + ret = hyp_ffa_init(ffa_proxy_pages); if (ret) goto out; pkvm_hyp_vm_table_init(vm_table_base); + + pkvm_ownership_selftest(selftest_base); out: /* * We tail-called to here from handle___pkvm_init() and will not return, @@ -330,7 +346,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, { struct kvm_nvhe_init_params *params; void *virt = hyp_phys_to_virt(phys); - void (*fn)(phys_addr_t params_pa, void *finalize_fn_va); + typeof(__pkvm_init_switch_pgd) *fn; int ret; BUG_ON(kvm_check_pvm_sysreg_table()); @@ -354,7 +370,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, /* Jump in the idmap page to switch to the new page-tables */ params = this_cpu_ptr(&kvm_init_params); fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd); - fn(__hyp_pa(params), __pkvm_init_finalise); + fn(params->pgd_pa, params->stack_hyp_va, __pkvm_init_finalise); unreachable(); } diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index ed6b58b19cfa..5b6eeab1a774 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -28,7 +28,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); - stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE); + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - NVHE_STACK_SIZE); stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); stacktrace_info->fp = fp; stacktrace_info->pc = pc; @@ -54,7 +54,7 @@ static struct stack_info stackinfo_get_hyp(void) { struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); unsigned long high = params->stack_hyp_va; - unsigned long low = high - PAGE_SIZE; + unsigned long low = high - NVHE_STACK_SIZE; return (struct stack_info) { .low = low, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index c2cb46ca4fb6..d3b9ec8a7c28 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -26,7 +26,6 @@ #include <asm/debug-monitors.h> #include <asm/processor.h> -#include <nvhe/fixed_config.h> #include <nvhe/mem_protect.h> /* Non-VHE specific context */ @@ -34,25 +33,30 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); +struct fgt_masks hfgrtr_masks; +struct fgt_masks hfgwtr_masks; +struct fgt_masks hfgitr_masks; +struct fgt_masks hdfgrtr_masks; +struct fgt_masks hdfgwtr_masks; +struct fgt_masks hafgrtr_masks; +struct fgt_masks hfgrtr2_masks; +struct fgt_masks hfgwtr2_masks; +struct fgt_masks hfgitr2_masks; +struct fgt_masks hdfgrtr2_masks; +struct fgt_masks hdfgwtr2_masks; + extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); static void __activate_traps(struct kvm_vcpu *vcpu) { - u64 val; + ___activate_traps(vcpu, vcpu->arch.hcr_el2); - ___activate_traps(vcpu); - __activate_traps_common(vcpu); + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - val = vcpu->arch.cptr_el2; - val |= CPTR_EL2_TTA | CPTR_EL2_TAM; - if (!guest_owns_fp_regs(vcpu)) { - val |= CPTR_EL2_TFP | CPTR_EL2_TZ; - __activate_traps_fpsimd32(vcpu); - } - if (cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; + __activate_traps_common(vcpu); + __activate_cptr_traps(vcpu); - write_sysreg(val, cptr_el2); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { @@ -73,7 +77,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { extern char __kvm_hyp_host_vector[]; - u64 cptr; ___deactivate_traps(vcpu); @@ -94,17 +97,13 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } - __deactivate_traps_common(vcpu); + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); - write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); + __deactivate_traps_common(vcpu); - cptr = CPTR_EL2_DEFAULT; - if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)) - cptr |= CPTR_EL2_TZ; - if (cpus_have_final_cap(ARM64_SME)) - cptr &= ~CPTR_EL2_TSM; + write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2); - write_sysreg(cptr, cptr_el2); + __deactivate_cptr_traps(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); } @@ -170,9 +169,8 @@ static void __pmu_switch_to_host(struct kvm_vcpu *vcpu) static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) { /* - * Make sure we handle the exit for workarounds and ptrauth - * before the pKVM handling, as the latter could decide to - * UNDEF. + * Make sure we handle the exit for workarounds before the pKVM + * handling, as the latter could decide to UNDEF. */ return (kvm_hyp_handle_sysreg(vcpu, exit_code) || kvm_handle_pvm_sysreg(vcpu, exit_code)); @@ -186,7 +184,8 @@ static const exit_handler_fn hyp_exit_handlers[] = { [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, - [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, + [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, + [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, }; static const exit_handler_fn pvm_exit_handlers[] = { @@ -196,33 +195,34 @@ static const exit_handler_fn pvm_exit_handlers[] = { [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, - [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, + [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, + [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, }; static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) { - if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)))) + if (unlikely(vcpu_is_protected(vcpu))) return pvm_exit_handlers; return hyp_exit_handlers; } -/* - * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. - * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a - * guest from dropping to AArch32 EL0 if implemented by the CPU. If the - * hypervisor spots a guest in such a state ensure it is handled, and don't - * trust the host to spot or fix it. The check below is based on the one in - * kvm_arch_vcpu_ioctl_run(). - * - * Returns false if the guest ran in AArch32 when it shouldn't have, and - * thus should exit to the host, or true if a the guest run loop can continue. - */ -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { - struct kvm *kvm = kern_hyp_va(vcpu->kvm); + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + + synchronize_vcpu_pstate(vcpu, exit_code); - if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) { + /* + * Some guests (e.g., protected VMs) are not be allowed to run in + * AArch32. The ARMv8 architecture does not give the hypervisor a + * mechanism to prevent a guest from dropping to AArch32 EL0 if + * implemented by the CPU. If the hypervisor spots a guest in such a + * state ensure it is handled, and don't trust the host to spot or fix + * it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + */ + if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { /* * As we have caught the guest red-handed, decide that it isn't * fit for purpose anymore by making the vcpu invalid. The VMM @@ -230,10 +230,12 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) * KVM_ARM_VCPU_INIT, however, this is likely not possible for * protected VMs. */ - vcpu->arch.target = -1; + vcpu_clear_flag(vcpu, VCPU_INITIALIZED); *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); *exit_code |= ARM_EXCEPTION_IL; } + + return __fixup_guest_exit(vcpu, exit_code, handlers); } /* Switch to the guest for legacy non-VHE systems */ @@ -256,7 +258,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) pmr_sync(); } - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; @@ -272,6 +274,17 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ __debug_save_host_buffers_nvhe(vcpu); + /* + * We're about to restore some new MMU state. Make sure + * ongoing page-table walks that have started before we + * trapped to EL2 have completed. This also synchronises the + * above disabling of BRBE, SPE and TRBE. + * + * See DDI0487I.a D8.1.5 "Out-of-context translation regimes", + * rule R_LFHQG and subsequent information statements. + */ + dsb(nsh); + __kvm_adjust_pc(vcpu); /* @@ -306,12 +319,19 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __timer_disable_traps(vcpu); __hyp_vgic_save_state(vcpu); + /* + * Same thing as before the guest run: we're about to switch + * the MMU context, so let's make sure we don't have any + * ongoing EL1&0 translations. + */ + dsb(nsh); + __deactivate_traps(vcpu); __load_host_stage2(); __sysreg_restore_state_nvhe(host_ctxt); - if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) + if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); @@ -341,7 +361,7 @@ asmlinkage void __noreturn hyp_panic(void) struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; if (vcpu) { diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 0f9ac25afdf4..3108b5185c20 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -11,7 +11,7 @@ #include <hyp/adjust_pc.h> -#include <nvhe/fixed_config.h> +#include <nvhe/pkvm.h> #include "../../sys_regs.h" @@ -26,230 +26,258 @@ u64 id_aa64isar2_el1_sys_val; u64 id_aa64mmfr0_el1_sys_val; u64 id_aa64mmfr1_el1_sys_val; u64 id_aa64mmfr2_el1_sys_val; +u64 id_aa64smfr0_el1_sys_val; + +struct pvm_ftr_bits { + bool sign; + u8 shift; + u8 width; + u8 max_val; + bool (*vm_supported)(const struct kvm *kvm); +}; -/* - * Inject an unknown/undefined exception to an AArch64 guest while most of its - * sysregs are live. - */ -static void inject_undef64(struct kvm_vcpu *vcpu) -{ - u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - - *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); - *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); - - kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); - - __kvm_adjust_pc(vcpu); - - write_sysreg_el1(esr, SYS_ESR); - write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); - write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); - write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); -} - -/* - * Returns the restricted features values of the feature register based on the - * limitations in restrict_fields. - * A feature id field value of 0b0000 does not impose any restrictions. - * Note: Use only for unsigned feature field values. - */ -static u64 get_restricted_features_unsigned(u64 sys_reg_val, - u64 restrict_fields) -{ - u64 value = 0UL; - u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); - - /* - * According to the Arm Architecture Reference Manual, feature fields - * use increasing values to indicate increases in functionality. - * Iterate over the restricted feature fields and calculate the minimum - * unsigned value between the one supported by the system, and what the - * value is being restricted to. - */ - while (sys_reg_val && restrict_fields) { - value |= min(sys_reg_val & mask, restrict_fields & mask); - sys_reg_val &= ~mask; - restrict_fields &= ~mask; - mask <<= ARM64_FEATURE_FIELD_BITS; +#define __MAX_FEAT_FUNC(id, fld, max, func, sgn) \ + { \ + .sign = sgn, \ + .shift = id##_##fld##_SHIFT, \ + .width = id##_##fld##_WIDTH, \ + .max_val = id##_##fld##_##max, \ + .vm_supported = func, \ } - return value; -} - -/* - * Functions that return the value of feature id registers for protected VMs - * based on allowed features, system features, and KVM support. - */ +#define MAX_FEAT_FUNC(id, fld, max, func) \ + __MAX_FEAT_FUNC(id, fld, max, func, id##_##fld##_SIGNED) -static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) -{ - const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); - u64 set_mask = 0; - u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; +#define MAX_FEAT(id, fld, max) \ + MAX_FEAT_FUNC(id, fld, max, NULL) - set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); +#define MAX_FEAT_ENUM(id, fld, max) \ + __MAX_FEAT_FUNC(id, fld, max, NULL, false) - /* Spectre and Meltdown mitigation in KVM */ - set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), - (u64)kvm->arch.pfr0_csv2); - set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), - (u64)kvm->arch.pfr0_csv3); +#define FEAT_END { .width = 0, } - return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; -} - -static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) +static bool vm_has_ptrauth(const struct kvm *kvm) { - const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); - u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; - - if (!kvm_has_mte(kvm)) - allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - - return id_aa64pfr1_el1_sys_val & allow_mask; -} - -static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for Scalable Vectors, therefore, hyp has no sanitized - * copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, including breakpoints, and watchpoints, - * therefore, pKVM has no sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, therefore, hyp has no sanitized copy of the - * feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); - return 0; -} + if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) + return false; -static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); - return 0; + return (cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || + cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC); } -static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) +static bool vm_has_sve(const struct kvm *kvm) { - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); - return 0; + return system_supports_sve() && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_SVE); } -static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) -{ - return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; -} +/* + * Definitions for features to be allowed or restricted for protected guests. + * + * Each field in the masks represents the highest supported value for the + * feature. If a feature field is not present, it is not supported. Moreover, + * these are used to generate the guest's view of the feature registers. + * + * The approach for protected VMs is to at least support features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ -static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64pfr0[] = { + MAX_FEAT(ID_AA64PFR0_EL1, EL0, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL1, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL3, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, FP, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, AdvSIMD, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, GIC, IMP), + MAX_FEAT_FUNC(ID_AA64PFR0_EL1, SVE, IMP, vm_has_sve), + MAX_FEAT(ID_AA64PFR0_EL1, RAS, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, DIT, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV3, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); +static const struct pvm_ftr_bits pvmid_aa64pfr1[] = { + MAX_FEAT(ID_AA64PFR1_EL1, BT, IMP), + MAX_FEAT(ID_AA64PFR1_EL1, SSBS, SSBS2), + MAX_FEAT_ENUM(ID_AA64PFR1_EL1, MTE_frac, NI), + FEAT_END +}; - return id_aa64isar1_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = { + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40), + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGEND, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, SNSMEM, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGENDEL0, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, EXS, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64mmfr1[] = { + MAX_FEAT(ID_AA64MMFR1_EL1, HAFDBS, DBM), + MAX_FEAT_ENUM(ID_AA64MMFR1_EL1, VMIDBits, 16), + MAX_FEAT(ID_AA64MMFR1_EL1, HPDS, HPDS2), + MAX_FEAT(ID_AA64MMFR1_EL1, PAN, PAN3), + MAX_FEAT(ID_AA64MMFR1_EL1, SpecSEI, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, ETS, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, CMOW, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); +static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = { + MAX_FEAT(ID_AA64MMFR2_EL1, CnP, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP), + MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18), + MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2), + MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP), + FEAT_END +}; - return id_aa64isar2_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64isar1[] = { + MAX_FEAT(ID_AA64ISAR1_EL1, DPB, DPB2), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, APA, PAuth, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, API, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR1_EL1, JSCVT, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FCMA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, LRCPC, LRCPC3), + MAX_FEAT(ID_AA64ISAR1_EL1, GPA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, GPI, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FRINTTS, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SB, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX), + MAX_FEAT(ID_AA64ISAR1_EL1, BF16, EBF16), + MAX_FEAT(ID_AA64ISAR1_EL1, DGH, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, I8MM, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask; +static const struct pvm_ftr_bits pvmid_aa64isar2[] = { + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, GPA3, IMP, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, APA3, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR2_EL1, ATS1A, IMP), + FEAT_END +}; - set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, - PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); +/* + * None of the features in ID_AA64DFR0_EL1 nor ID_AA64MMFR4_EL1 are supported. + * However, both have Not-Implemented values that are non-zero. Define them + * so they can be used when getting the value of these registers. + */ +#define ID_AA64DFR0_EL1_NONZERO_NI \ +( \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI) | \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, MTPMU, NI) \ +) - return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; -} +#define ID_AA64MMFR4_EL1_NONZERO_NI \ + SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI) -static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) +/* + * Returns the value of the feature registers based on the system register + * value, the vcpu support for the revelant features, and the additional + * restrictions for protected VMs. + */ +static u64 get_restricted_features(const struct kvm_vcpu *vcpu, + u64 sys_reg_val, + const struct pvm_ftr_bits restrictions[]) { - return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; -} + u64 val = 0UL; + int i; + + for (i = 0; restrictions[i].width != 0; i++) { + bool (*vm_supported)(const struct kvm *) = restrictions[i].vm_supported; + bool sign = restrictions[i].sign; + int shift = restrictions[i].shift; + int width = restrictions[i].width; + u64 min_signed = (1UL << width) - 1UL; + u64 sign_bit = 1UL << (width - 1); + u64 mask = GENMASK_ULL(width + shift - 1, shift); + u64 sys_val = (sys_reg_val & mask) >> shift; + u64 pvm_max = restrictions[i].max_val; + + if (vm_supported && !vm_supported(vcpu->kvm)) + val |= (sign ? min_signed : 0) << shift; + else if (sign && (sys_val >= sign_bit || pvm_max >= sign_bit)) + val |= max(sys_val, pvm_max) << shift; + else + val |= min(sys_val, pvm_max) << shift; + } -static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) -{ - return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; + return val; } -/* Read a sanitized cpufeature ID register by its encoding */ -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) { switch (id) { case SYS_ID_AA64PFR0_EL1: - return get_pvm_id_aa64pfr0(vcpu); + return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0); case SYS_ID_AA64PFR1_EL1: - return get_pvm_id_aa64pfr1(vcpu); - case SYS_ID_AA64ZFR0_EL1: - return get_pvm_id_aa64zfr0(vcpu); - case SYS_ID_AA64DFR0_EL1: - return get_pvm_id_aa64dfr0(vcpu); - case SYS_ID_AA64DFR1_EL1: - return get_pvm_id_aa64dfr1(vcpu); - case SYS_ID_AA64AFR0_EL1: - return get_pvm_id_aa64afr0(vcpu); - case SYS_ID_AA64AFR1_EL1: - return get_pvm_id_aa64afr1(vcpu); + return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1); case SYS_ID_AA64ISAR0_EL1: - return get_pvm_id_aa64isar0(vcpu); + return id_aa64isar0_el1_sys_val; case SYS_ID_AA64ISAR1_EL1: - return get_pvm_id_aa64isar1(vcpu); + return get_restricted_features(vcpu, id_aa64isar1_el1_sys_val, pvmid_aa64isar1); case SYS_ID_AA64ISAR2_EL1: - return get_pvm_id_aa64isar2(vcpu); + return get_restricted_features(vcpu, id_aa64isar2_el1_sys_val, pvmid_aa64isar2); case SYS_ID_AA64MMFR0_EL1: - return get_pvm_id_aa64mmfr0(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr0_el1_sys_val, pvmid_aa64mmfr0); case SYS_ID_AA64MMFR1_EL1: - return get_pvm_id_aa64mmfr1(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr1_el1_sys_val, pvmid_aa64mmfr1); case SYS_ID_AA64MMFR2_EL1: - return get_pvm_id_aa64mmfr2(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr2_el1_sys_val, pvmid_aa64mmfr2); + case SYS_ID_AA64DFR0_EL1: + return ID_AA64DFR0_EL1_NONZERO_NI; + case SYS_ID_AA64MMFR4_EL1: + return ID_AA64MMFR4_EL1_NONZERO_NI; default: /* Unhandled ID register, RAZ */ return 0; } } +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR)); + + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); +} + static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { - return pvm_read_id_reg(vcpu, reg_to_encoding(r)); + struct kvm *kvm = vcpu->kvm; + u32 reg = reg_to_encoding(r); + + if (WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))) + return 0; + + if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7)) + return kvm->arch.id_regs[IDREG_IDX(reg)]; + + return 0; } /* Handler to RAZ/WI sysregs */ @@ -277,13 +305,6 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, return false; } - /* - * No support for AArch32 guests, therefore, pKVM has no sanitized copy - * of AArch32 feature id registers. - */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_ELx_64BIT_ONLY); - return pvm_access_raz_wi(vcpu, p, r); } @@ -352,6 +373,9 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Debug and Trace Registers are restricted. */ + /* Group 1 ID registers */ + HOST_HANDLED(SYS_REVIDR_EL1), + /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ AARCH32(SYS_ID_PFR0_EL1), @@ -420,6 +444,8 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Scalable Vector Registers are restricted. */ + HOST_HANDLED(SYS_ICC_PMR_EL1), + RAZ_WI(SYS_ERRIDR_EL1), RAZ_WI(SYS_ERRSELR_EL1), RAZ_WI(SYS_ERXFR_EL1), @@ -433,13 +459,17 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Limited Ordering Regions Registers are restricted. */ + HOST_HANDLED(SYS_ICC_DIR_EL1), + HOST_HANDLED(SYS_ICC_RPR_EL1), HOST_HANDLED(SYS_ICC_SGI1R_EL1), HOST_HANDLED(SYS_ICC_ASGI1R_EL1), HOST_HANDLED(SYS_ICC_SGI0R_EL1), + HOST_HANDLED(SYS_ICC_CTLR_EL1), { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, HOST_HANDLED(SYS_CCSIDR_EL1), HOST_HANDLED(SYS_CLIDR_EL1), + HOST_HANDLED(SYS_AIDR_EL1), HOST_HANDLED(SYS_CSSELR_EL1), HOST_HANDLED(SYS_CTR_EL0), @@ -455,6 +485,30 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { }; /* + * Initializes feature registers for protected vms. + */ +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_arch *ka = &kvm->arch; + u32 r; + + hyp_assert_lock_held(&vm_table_lock); + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + /* + * Initialize only AArch64 id registers since AArch32 isn't supported + * for protected VMs. + */ + for (r = sys_reg(3, 0, 0, 4, 0); r <= sys_reg(3, 0, 0, 7, 7); r += sys_reg(0, 0, 0, 0, 1)) + ka->id_regs[IDREG_IDX(r)] = pvm_calc_id_reg(vcpu, r); + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); +} + +/* * Checks that the sysreg table is unique and in-order. * * Returns 0 if the table is consistent, or 1 otherwise. diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c index 29305022bc04..3cc613cce5f5 100644 --- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -28,7 +28,9 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt) void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) { - __sysreg_restore_el1_state(ctxt); + u64 midr = ctxt_midr_el1(ctxt); + + __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1)); __sysreg_restore_common_state(ctxt); __sysreg_restore_user_state(ctxt); __sysreg_restore_el2_return_state(ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c index 9072e71693ba..ff176f4ce7de 100644 --- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -9,6 +9,7 @@ #include <linux/kvm_host.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> void __kvm_timer_set_cntvoff(u64 cntvoff) { @@ -16,33 +17,54 @@ void __kvm_timer_set_cntvoff(u64 cntvoff) } /* - * Should only be called on non-VHE systems. + * Should only be called on non-VHE or hVHE setups. * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). */ void __timer_disable_traps(struct kvm_vcpu *vcpu) { - u64 val; + u64 set, clr, shift = 0; + + if (has_hvhe()) + shift = 10; /* Allow physical timer/counter access for the host */ - val = read_sysreg(cnthctl_el2); - val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; - write_sysreg(val, cnthctl_el2); + set = (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; + clr = CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + + sysreg_clear_set(cnthctl_el2, clr, set); } /* - * Should only be called on non-VHE systems. + * Should only be called on non-VHE or hVHE setups. * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). */ void __timer_enable_traps(struct kvm_vcpu *vcpu) { - u64 val; + u64 clr = 0, set = 0; /* * Disallow physical timer access for the guest - * Physical counter access is allowed + * Physical counter access is allowed if no offset is enforced + * or running protected (we don't offset anything in this case). */ - val = read_sysreg(cnthctl_el2); - val &= ~CNTHCTL_EL1PCEN; - val |= CNTHCTL_EL1PCTEN; - write_sysreg(val, cnthctl_el2); + clr = CNTHCTL_EL1PCEN; + if (is_protected_kvm_enabled() || + !kern_hyp_va(vcpu->kvm)->arch.timer_data.poffset) + set |= CNTHCTL_EL1PCTEN; + else + clr |= CNTHCTL_EL1PCTEN; + + if (has_hvhe()) { + clr <<= 10; + set <<= 10; + } + + /* + * Trap the virtual counter/timer if we have a broken cntvoff + * implementation. + */ + if (has_broken_cntvoff()) + set |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + + sysreg_clear_set(cnthctl_el2, clr, set); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index d296d617f589..48da9ca9763f 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -11,26 +11,94 @@ #include <nvhe/mem_protect.h> struct tlb_inv_context { - u64 tcr; + struct kvm_s2_mmu *mmu; + u64 tcr; + u64 sctlr; }; -static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, - struct tlb_inv_context *cxt) +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt, + bool nsh) { + struct kvm_s2_mmu *host_s2_mmu = &host_mmu.arch.mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + cxt->mmu = NULL; + + /* + * We have two requirements: + * + * - ensure that the page table updates are visible to all + * CPUs, for which a dsb(DOMAIN-st) is what we need, DOMAIN + * being either ish or nsh, depending on the invalidation + * type. + * + * - complete any speculative page table walk started before + * we trapped to EL2 so that we can mess with the MM + * registers out of context, for which dsb(nsh) is enough + * + * The composition of these two barriers is a dsb(DOMAIN), and + * the 'nsh' parameter tracks the distinction between + * Inner-Shareable and Non-Shareable, as specified by the + * callers. + */ + if (nsh) + dsb(nsh); + else + dsb(ish); + + /* + * If we're already in the desired context, then there's nothing to do. + */ + if (vcpu) { + /* + * We're in guest context. However, for this to work, this needs + * to be called from within __kvm_vcpu_run(), which ensures that + * __hyp_running_vcpu is set to the current guest vcpu. + */ + if (mmu == vcpu->arch.hw_mmu || WARN_ON(mmu != host_s2_mmu)) + return; + + cxt->mmu = vcpu->arch.hw_mmu; + } else { + /* We're in host context. */ + if (mmu == host_s2_mmu) + return; + + cxt->mmu = host_s2_mmu; + } + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; /* * For CPUs that are affected by ARM 1319367, we need to - * avoid a host Stage-1 walk while we have the guest's - * VMID set in the VTTBR in order to invalidate TLBs. - * We're guaranteed that the S1 MMU is enabled, so we can - * simply set the EPD bits to avoid any further TLB fill. + * avoid a Stage-1 walk with the old VMID while we have + * the new VMID set in the VTTBR in order to invalidate TLBs. + * We're guaranteed that the host S1 MMU is enabled, so + * we can simply set the EPD bits to avoid any further + * TLB fill. For guests, we ensure that the S1 MMU is + * temporarily enabled in the next context. */ val = cxt->tcr = read_sysreg_el1(SYS_TCR); val |= TCR_EPD1_MASK | TCR_EPD0_MASK; write_sysreg_el1(val, SYS_TCR); isb(); + + if (vcpu) { + val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR); + if (!(val & SCTLR_ELx_M)) { + val |= SCTLR_ELx_M; + write_sysreg_el1(val, SYS_SCTLR); + isb(); + } + } else { + /* The host S1 MMU is always enabled. */ + cxt->sctlr = SCTLR_ELx_M; + } } /* @@ -39,18 +107,40 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * ensuring that we always have an ISB, but not two ISBs back * to back. */ - __load_stage2(mmu, kern_hyp_va(mmu->arch)); + if (vcpu) + __load_host_stage2(); + else + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static void __tlb_switch_to_host(struct tlb_inv_context *cxt) +static void exit_vmid_context(struct tlb_inv_context *cxt) { - __load_host_stage2(); + struct kvm_s2_mmu *mmu = cxt->mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + + if (!mmu) + return; + + if (vcpu) + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + else + __load_host_stage2(); + + /* Ensure write of the old VMID */ + isb(); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { - /* Ensure write of the host VMID */ - isb(); - /* Restore the host's TCR_EL1 */ + if (!(cxt->sctlr & SCTLR_ELx_M)) { + write_sysreg_el1(cxt->sctlr, SYS_SCTLR); + isb(); + } + write_sysreg_el1(cxt->tcr, SYS_TCR); } } @@ -60,10 +150,8 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, { struct tlb_inv_context cxt; - dsb(ishst); - /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt, false); /* * We could do so much better if we had the VA as well. @@ -84,45 +172,78 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ish); isb(); + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt, true); + /* - * If the host is running at EL1 and we have a VPIPT I-cache, - * then we must perform I-cache maintenance at EL2 in order for - * it to have an effect on the guest. Since the guest cannot hit - * I-cache lines allocated with a different VMID, we don't need - * to worry about junk out of guest reset (we nuke the I-cache on - * VMID rollover), but we do need to be careful when remapping - * executable pages for the same guest. This can happen when KSM - * takes a CoW fault on an executable page, copies the page into - * a page that was previously mapped in the guest and then needs - * to invalidate the guest view of the I-cache for that page - * from EL1. To solve this, we invalidate the entire I-cache when - * unmapping a page from a guest if we have a VPIPT I-cache but - * the host is running at EL1. As above, we could do better if - * we had the VA. - * - * The moral of this story is: if you have a VPIPT I-cache, then - * you should be running with VHE enabled. + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... */ - if (icache_is_vpipt()) - icache_inval_all_pou(); + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } -void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) +void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, + phys_addr_t start, unsigned long pages) { struct tlb_inv_context cxt; + unsigned long stride; + + /* + * Since the range of addresses may not be mapped at + * the same level, assume the worst case as PAGE_SIZE + */ + stride = PAGE_SIZE; + start = round_down(start, stride); + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt, false); - dsb(ishst); + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); + + dsb(ish); + __tlbi(vmalle1is); + dsb(ish); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) +{ + struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt, false); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) @@ -130,32 +251,20 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt, false); __tlbi(vmalle1); asm volatile("ic iallu"); dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_vm_context(void) { - dsb(ishst); + /* Same remark as in enter_vmid_context() */ + dsb(ish); __tlbi(alle1is); - - /* - * VIPT and PIPT caches are not affected by VMID, so no maintenance - * is necessary across a VMID rollover. - * - * VPIPT caches constrain lookup and maintenance to the active VMID, - * so we need to invalidate lines with a stale VMID to avoid an ABA - * race after multiple rollovers. - * - */ - if (icache_is_vpipt()) - asm volatile("ic ialluis"); - dsb(ish); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b11cf2c618a6..947ac1a951a5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -11,60 +11,22 @@ #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> - -#define KVM_PTE_TYPE BIT(1) -#define KVM_PTE_TYPE_BLOCK 0 -#define KVM_PTE_TYPE_PAGE 1 -#define KVM_PTE_TYPE_TABLE 1 - -#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) - -#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 -#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) - -#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) - -#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ - KVM_PTE_LEAF_ATTR_HI_S2_XN) - -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID 1 - -/* - * Used to indicate a pte for which a 'break-before-make' sequence is in - * progress. - */ -#define KVM_INVALID_PTE_LOCKED BIT(10) - struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; + const u64 start; u64 addr; - u64 end; + const u64 end; }; -static bool kvm_phys_is_valid(u64 phys) +static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) +{ + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); +} + +static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) { - return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); } static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) @@ -77,13 +39,13 @@ static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, if (granule > (ctx->end - ctx->addr)) return false; - if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) + if (!IS_ALIGNED(phys, granule)) return false; return IS_ALIGNED(ctx->addr, granule); } -static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) +static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level) { u64 shift = kvm_granule_shift(level); u64 mask = BIT(PAGE_SHIFT - 3) - 1; @@ -99,7 +61,7 @@ static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) return (addr & mask) >> shift; } -static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) +static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level) { struct kvm_pgtable pgt = { .ia_bits = ia_bits, @@ -109,9 +71,9 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_table(kvm_pte_t pte, u32 level) +static bool kvm_pte_table(kvm_pte_t pte, s8 level) { - if (level == KVM_PGTABLE_MAX_LEVELS - 1) + if (level == KVM_PGTABLE_LAST_LEVEL) return false; if (!kvm_pte_valid(pte)) @@ -139,11 +101,11 @@ static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops return pte; } -static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) +static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) { kvm_pte_t pte = kvm_phys_to_pte(pa); - u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : - KVM_PTE_TYPE_BLOCK; + u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE : + KVM_PTE_TYPE_BLOCK; pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); pte |= FIELD_PREP(KVM_PTE_TYPE, type); @@ -168,12 +130,31 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, return walker->cb(ctx, visit); } +static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, + int r) +{ + /* + * Visitor callbacks return EAGAIN when the conditions that led to a + * fault are no longer reflected in the page tables due to a race to + * update a PTE. In the context of a fault handler this is interpreted + * as a signal to retry guest execution. + * + * Ignore the return code altogether for walkers outside a fault handler + * (e.g. write protecting a range of memory) and chug along with the + * page table walk. + */ + if (r == -EAGAIN) + return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT); + + return !r; +} + static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level); + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level); static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, - kvm_pteref_t pteref, u32 level) + kvm_pteref_t pteref, s8 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); @@ -182,25 +163,38 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, .old = READ_ONCE(*ptep), .arg = data->walker->arg, .mm_ops = mm_ops, + .start = data->start, .addr = data->addr, .end = data->end, .level = level, .flags = flags, }; int ret = 0; + bool reload = false; kvm_pteref_t childp; bool table = kvm_pte_table(ctx.old, level); - if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) + if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) { ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE); + reload = true; + } if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) { ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF); + reload = true; + } + + /* + * Reload the page table after invoking the walker callback for leaf + * entries or after pre-order traversal, to allow the walker to descend + * into a newly installed or replaced table. + */ + if (reload) { ctx.old = READ_ONCE(*ptep); table = kvm_pte_table(ctx.old, level); } - if (ret) + if (!kvm_pgtable_walk_continue(data->walker, ret)) goto out; if (!table) { @@ -211,23 +205,27 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops); ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1); - if (ret) + if (!kvm_pgtable_walk_continue(data->walker, ret)) goto out; if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST) ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST); out: + if (kvm_pgtable_walk_continue(data->walker, ret)) + return 0; + return ret; } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level) + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level) { u32 idx; int ret = 0; - if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)) + if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL || + level > KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { @@ -271,6 +269,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker) { struct kvm_pgtable_walk_data walk_data = { + .start = ALIGN_DOWN(addr, PAGE_SIZE), .addr = ALIGN_DOWN(addr, PAGE_SIZE), .end = PAGE_ALIGN(walk_data.addr + size), .walker = walker, @@ -289,7 +288,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct leaf_walk_data { kvm_pte_t pte; - u32 level; + s8 level; }; static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -304,7 +303,7 @@ static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, } int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, - kvm_pte_t *ptep, u32 *level) + kvm_pte_t *ptep, s8 *level) { struct leaf_walk_data data; struct kvm_pgtable_walker walker = { @@ -327,7 +326,7 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, } struct hyp_map_data { - u64 phys; + const u64 phys; kvm_pte_t attr; }; @@ -349,12 +348,16 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) if (device) return -EINVAL; + + if (system_supports_bti_kernel()) + attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP; } else { attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; } attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); - attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); + if (!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; @@ -385,13 +388,12 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct hyp_map_data *data) { + u64 phys = data->phys + (ctx->addr - ctx->start); kvm_pte_t new; - u64 granule = kvm_granule_size(ctx->level), phys = data->phys; if (!kvm_block_mapping_supported(ctx, phys)) return false; - data->phys += granule; new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); if (ctx->old == new) return true; @@ -414,7 +416,7 @@ static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, if (hyp_map_walker_try_leaf(ctx, data)) return 0; - if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); @@ -470,7 +472,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; @@ -510,14 +512,19 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops) { - u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); + s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 - + ARM64_HW_PGTABLE_LEVELS(va_bits); + + if (start_level < KVM_PGTABLE_FIRST_LEVEL || + start_level > KVM_PGTABLE_LAST_LEVEL) + return -EINVAL; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = va_bits; - pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; + pgt->start_level = start_level; pgt->mm_ops = mm_ops; pgt->mmu = NULL; pgt->force_pte_cb = NULL; @@ -554,7 +561,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) } struct stage2_map_data { - u64 phys; + const u64 phys; kvm_pte_t attr; u8 owner_id; @@ -566,12 +573,15 @@ struct stage2_map_data { /* Force mappings to page granularity */ bool force_pte; + + /* Walk should update owner_id only */ + bool annotation; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) { u64 vtcr = VTCR_EL2_FLAGS; - u8 lvls; + s8 lvls; vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); @@ -582,14 +592,36 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) lvls = stage2_pgtable_levels(phys_shift); if (lvls < 2) lvls = 2; + + /* + * When LPA2 is enabled, the HW supports an extra level of translation + * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2 + * to as an addition to SL0 to enable encoding this extra start level. + * However, since we always use concatenated pages for the first level + * lookup, we will never need this extra level and therefore do not need + * to touch SL2. + */ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); +#ifdef CONFIG_ARM64_HW_AFDBM /* * Enable the Hardware Access Flag management, unconditionally - * on all CPUs. The features is RES0 on CPUs without the support - * and must be ignored by the CPUs. + * on all CPUs. In systems that have asymmetric support for the feature + * this allows KVM to leverage hardware support on the subset of cores + * that implement the feature. + * + * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by + * hardware) on implementations that do not advertise support for the + * feature. As such, setting HA unconditionally is safe, unless you + * happen to be running on a design that has unadvertised support for + * HAFDBS. Here be dragons. */ - vtcr |= VTCR_EL2_HA; + if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + vtcr |= VTCR_EL2_HA; +#endif /* CONFIG_ARM64_HW_AFDBM */ + + if (kvm_lpa2_is_enabled()) + vtcr |= VTCR_EL2_DS; /* Set the vmid bits */ vtcr |= (get_vmid_bits(mmfr1) == 16) ? @@ -601,26 +633,87 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) static bool stage2_has_fwb(struct kvm_pgtable *pgt) { - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) return false; return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); } +void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, + phys_addr_t addr, size_t size) +{ + unsigned long pages, inval_pages; + + if (!system_supports_tlb_range()) { + kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); + return; + } + + pages = size >> PAGE_SHIFT; + while (pages > 0) { + inval_pages = min(pages, MAX_TLBI_RANGE_PAGES); + kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages); + + addr += inval_pages << PAGE_SHIFT; + pages -= inval_pages; + } +} + #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) +static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr) +{ + bool px, ux; + u8 xn; + + px = prot & KVM_PGTABLE_PROT_PX; + ux = prot & KVM_PGTABLE_PROT_UX; + + if (!cpus_have_final_cap(ARM64_HAS_XNX) && px != ux) + return -EINVAL; + + if (px && ux) + xn = 0b00; + else if (!px && ux) + xn = 0b01; + else if (!px && !ux) + xn = 0b10; + else + xn = 0b11; + + *attr &= ~KVM_PTE_LEAF_ATTR_HI_S2_XN; + *attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, xn); + return 0; +} + static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { - bool device = prot & KVM_PGTABLE_PROT_DEVICE; - kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : - KVM_S2_MEMATTR(pgt, NORMAL); + kvm_pte_t attr; u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + int r; - if (!(prot & KVM_PGTABLE_PROT_X)) - attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - else if (device) + switch (prot & (KVM_PGTABLE_PROT_DEVICE | + KVM_PGTABLE_PROT_NORMAL_NC)) { + case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC: return -EINVAL; + case KVM_PGTABLE_PROT_DEVICE: + if (prot & KVM_PGTABLE_PROT_X) + return -EINVAL; + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); + break; + case KVM_PGTABLE_PROT_NORMAL_NC: + if (prot & KVM_PGTABLE_PROT_X) + return -EINVAL; + attr = KVM_S2_MEMATTR(pgt, NORMAL_NC); + break; + default: + attr = KVM_S2_MEMATTR(pgt, NORMAL); + } + + r = stage2_set_xn_attr(prot, &attr); + if (r) + return r; if (prot & KVM_PGTABLE_PROT_R) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; @@ -628,7 +721,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p if (prot & KVM_PGTABLE_PROT_W) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; - attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + if (!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; @@ -647,8 +742,20 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) prot |= KVM_PGTABLE_PROT_R; if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) prot |= KVM_PGTABLE_PROT_W; - if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) - prot |= KVM_PGTABLE_PROT_X; + + switch (FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte)) { + case 0b00: + prot |= KVM_PGTABLE_PROT_PX | KVM_PGTABLE_PROT_UX; + break; + case 0b01: + prot |= KVM_PGTABLE_PROT_UX; + break; + case 0b11: + prot |= KVM_PGTABLE_PROT_PX; + break; + default: + break; + } return prot; } @@ -717,14 +824,21 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) return false; - /* - * Perform the appropriate TLB invalidation based on the evicted pte - * value (if any). - */ - if (kvm_pte_table(ctx->old, ctx->level)) - kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); - else if (kvm_pte_valid(ctx->old)) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { + /* + * Perform the appropriate TLB invalidation based on the + * evicted pte value (if any). + */ + if (kvm_pte_table(ctx->old, ctx->level)) { + u64 size = kvm_granule_size(ctx->level); + u64 addr = ALIGN_DOWN(ctx->addr, size); + + kvm_tlb_flush_vmid_range(mmu, addr, size); + } else if (kvm_pte_valid(ctx->old)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, + ctx->addr, ctx->level); + } + } if (stage2_pte_is_counted(ctx->old)) mm_ops->put_page(ctx->ptep); @@ -744,16 +858,40 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n smp_store_release(ctx->ptep, new); } -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu, - struct kvm_pgtable_mm_ops *mm_ops) +static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt) +{ + /* + * If FEAT_TLBIRANGE is implemented, defer the individual + * TLB invalidations until the entire walk is finished, and + * then use the range-based TLBI instructions to do the + * invalidations. Condition deferred TLB invalidation on the + * system supporting FWB as the optimization is entirely + * pointless when the unmap walker needs to perform CMOs. + */ + return system_supports_tlb_range() && stage2_has_fwb(pgt); +} + +static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, + struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) { + struct kvm_pgtable *pgt = ctx->arg; + /* - * Clear the existing PTE, and perform break-before-make with - * TLB maintenance if it was valid. + * Clear the existing PTE, and perform break-before-make if it was + * valid. Depending on the system support, defer the TLB maintenance + * for the same until the entire unmap walk is completed. */ if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + + if (kvm_pte_table(ctx->old, ctx->level)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + TLBI_TTL_UNKNOWN); + } else if (!stage2_unmap_defer_tlb_flush(pgt)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + ctx->level); + } } mm_ops->put_page(ctx->ptep); @@ -762,35 +900,50 @@ static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) { u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; - return memattr == KVM_S2_MEMATTR(pgt, NORMAL); + return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL); } static bool stage2_pte_executable(kvm_pte_t pte) { - return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); + return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); +} + +static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, + const struct stage2_map_data *data) +{ + u64 phys = data->phys; + + /* Work out the correct PA based on how far the walk has gotten */ + return phys + (ctx->addr - ctx->start); } static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1))) + u64 phys = stage2_map_walker_phys_addr(ctx, data); + + if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) return false; - return kvm_block_mapping_supported(ctx, data->phys); + if (data->annotation) + return true; + + return kvm_block_mapping_supported(ctx, phys); } static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { kvm_pte_t new; - u64 granule = kvm_granule_size(ctx->level), phys = data->phys; + u64 phys = stage2_map_walker_phys_addr(ctx, data); + u64 granule = kvm_granule_size(ctx->level); struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG; - if (kvm_phys_is_valid(phys)) + if (!data->annotation) new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else new = kvm_init_invalid_leaf_owner(data->owner_id); @@ -804,21 +957,36 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_pte_needs_update(ctx->old, new)) return -EAGAIN; + /* If we're only changing software bits, then store them and go! */ + if (!kvm_pgtable_walk_shared(ctx) && + !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) { + bool old_is_counted = stage2_pte_is_counted(ctx->old); + + if (old_is_counted != stage2_pte_is_counted(new)) { + if (old_is_counted) + mm_ops->put_page(ctx->ptep); + else + mm_ops->get_page(ctx->ptep); + } + WARN_ON_ONCE(!stage2_try_set_pte(ctx, new)); + return 0; + } + if (!stage2_try_break_pte(ctx, data->mmu)) return -EAGAIN; /* Perform CMOs before installation of the guest stage-2 PTE */ - if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc && + stage2_pte_cacheable(pgt, new)) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), - granule); + granule); - if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou && + stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); stage2_make_pte(ctx, new); - if (kvm_phys_is_valid(phys)) - data->phys += granule; return 0; } @@ -836,7 +1004,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, if (ret) return ret; - mm_ops->free_removed_table(childp, ctx->level); + mm_ops->free_unlinked_table(childp, ctx->level); return 0; } @@ -851,7 +1019,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (ret != -E2BIG) return ret; - if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; if (!data->memcache) @@ -881,7 +1049,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, * The TABLE_PRE callback runs for table entries on the way down, looking * for table entries which we could conceivably replace with a block entry * for this mapping. If it finds one it replaces the entry and calls - * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table. + * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. * * Otherwise, the LEAF callback performs the mapping at the existing leaves * instead. @@ -937,11 +1105,11 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, { int ret; struct stage2_map_data map_data = { - .phys = KVM_PHYS_INVALID, .mmu = pgt->mmu, .memcache = mc, .owner_id = owner_id, .force_pte = true, + .annotation = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -988,7 +1156,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, * block entry and rely on the remaining portions being faulted * back lazily. */ - stage2_put_pte(ctx, mmu, mm_ops); + stage2_unmap_put_pte(ctx, mmu, mm_ops); if (need_flush && mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), @@ -1002,20 +1170,26 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { + int ret; struct kvm_pgtable_walker walker = { .cb = stage2_unmap_walker, .arg = pgt, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; - return kvm_pgtable_walk(pgt, addr, size, &walker); + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + if (stage2_unmap_defer_tlb_flush(pgt)) + /* Perform the deferred TLB invalidations */ + kvm_tlb_flush_vmid_range(pgt->mmu, addr, size); + + return ret; } struct stage2_attr_data { kvm_pte_t attr_set; kvm_pte_t attr_clr; kvm_pte_t pte; - u32 level; + s8 level; }; static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -1026,7 +1200,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) - return 0; + return -EAGAIN; data->level = ctx->level; data->pte = pte; @@ -1058,7 +1232,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, u64 size, kvm_pte_t attr_set, kvm_pte_t attr_clr, kvm_pte_t *orig_pte, - u32 *level, enum kvm_pgtable_walk_flags flags) + s8 *level, enum kvm_pgtable_walk_flags flags) { int ret; kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; @@ -1091,42 +1265,73 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) NULL, NULL, 0); } -kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) +void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) { - kvm_pte_t pte = 0; - stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - &pte, NULL, 0); - dsb(ishst); - return pte; + int ret; + + ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, + NULL, NULL, flags); + if (!ret) + dsb(ishst); } -kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) +struct stage2_age_data { + bool mkold; + bool young; +}; + +static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - kvm_pte_t pte = 0; - stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF, - &pte, NULL, 0); + kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; + struct stage2_age_data *data = ctx->arg; + + if (!kvm_pte_valid(ctx->old) || new == ctx->old) + return 0; + + data->young = true; + + /* + * stage2_age_walker() is always called while holding the MMU lock for + * write, so this will always succeed. Nonetheless, this deliberately + * follows the race detection pattern of the other stage-2 walkers in + * case the locking mechanics of the MMU notifiers is ever changed. + */ + if (data->mkold && !stage2_try_set_pte(ctx, new)) + return -EAGAIN; + /* * "But where's the TLBI?!", you scream. * "Over in the core code", I sigh. * * See the '->clear_flush_young()' callback on the KVM mmu notifier. */ - return pte; + return 0; } -bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr) +bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, + u64 size, bool mkold) { - kvm_pte_t pte = 0; - stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0); - return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF; + struct stage2_age_data data = { + .mkold = mkold, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_age_walker, + .arg = &data, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); + return data.young; } int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot) + enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { + kvm_pte_t xn = 0, set = 0, clr = 0; + s8 level; int ret; - u32 level; - kvm_pte_t set = 0, clr = 0; if (prot & KVM_PTE_LEAF_ATTR_HI_SW) return -EINVAL; @@ -1137,13 +1342,16 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_W) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; - if (prot & KVM_PGTABLE_PROT_X) - clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + ret = stage2_set_xn_attr(prot, &xn); + if (ret) + return ret; - ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, - KVM_PGTABLE_WALK_SHARED); - if (!ret) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); + set |= xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; + clr |= ~xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; + + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); + if (!ret || ret == -EAGAIN) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; } @@ -1153,7 +1361,7 @@ static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_pgtable *pgt = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; - if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old)) + if (!stage2_pte_cacheable(pgt, ctx->old)) return 0; if (mm_ops->dcache_clean_inval_poc) @@ -1176,6 +1384,162 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) return kvm_pgtable_walk(pgt, addr, size, &walker); } +kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, + u64 phys, s8 level, + enum kvm_pgtable_prot prot, + void *mc, bool force_pte) +{ + struct stage2_map_data map_data = { + .phys = phys, + .mmu = pgt->mmu, + .memcache = mc, + .force_pte = force_pte, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_SKIP_BBM_TLBI | + KVM_PGTABLE_WALK_SKIP_CMO, + .arg = &map_data, + }; + /* + * The input address (.addr) is irrelevant for walking an + * unlinked table. Construct an ambiguous IA range to map + * kvm_granule_size(level) worth of memory. + */ + struct kvm_pgtable_walk_data data = { + .walker = &walker, + .addr = 0, + .end = kvm_granule_size(level), + }; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + kvm_pte_t *pgtable; + int ret; + + if (!IS_ALIGNED(phys, kvm_granule_size(level))) + return ERR_PTR(-EINVAL); + + ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); + if (ret) + return ERR_PTR(ret); + + pgtable = mm_ops->zalloc_page(mc); + if (!pgtable) + return ERR_PTR(-ENOMEM); + + ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, + level + 1); + if (ret) { + kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); + return ERR_PTR(ret); + } + + return pgtable; +} + +/* + * Get the number of page-tables needed to replace a block with a + * fully populated tree up to the PTE entries. Note that @level is + * interpreted as in "level @level entry". + */ +static int stage2_block_get_nr_page_tables(s8 level) +{ + switch (level) { + case 1: + return PTRS_PER_PTE + 1; + case 2: + return 1; + case 3: + return 0; + default: + WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || + level > KVM_PGTABLE_LAST_LEVEL); + return -EINVAL; + }; +} + +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + struct kvm_mmu_memory_cache *mc = ctx->arg; + struct kvm_s2_mmu *mmu; + kvm_pte_t pte = ctx->old, new, *childp; + enum kvm_pgtable_prot prot; + s8 level = ctx->level; + bool force_pte; + int nr_pages; + u64 phys; + + /* No huge-pages exist at the last level */ + if (level == KVM_PGTABLE_LAST_LEVEL) + return 0; + + /* We only split valid block mappings */ + if (!kvm_pte_valid(pte)) + return 0; + + nr_pages = stage2_block_get_nr_page_tables(level); + if (nr_pages < 0) + return nr_pages; + + if (mc->nobjs >= nr_pages) { + /* Build a tree mapped down to the PTE granularity. */ + force_pte = true; + } else { + /* + * Don't force PTEs, so create_unlinked() below does + * not populate the tree up to the PTE level. The + * consequence is that the call will require a single + * page of level 2 entries at level 1, or a single + * page of PTEs at level 2. If we are at level 1, the + * PTEs will be created recursively. + */ + force_pte = false; + nr_pages = 1; + } + + if (mc->nobjs < nr_pages) + return -ENOMEM; + + mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); + phys = kvm_pte_to_phys(pte); + prot = kvm_pgtable_stage2_pte_prot(pte); + + childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, + level, prot, mc, force_pte); + if (IS_ERR(childp)) + return PTR_ERR(childp); + + if (!stage2_try_break_pte(ctx, mmu)) { + kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); + return -EAGAIN; + } + + /* + * Note, the contents of the page table are guaranteed to be made + * visible before the new PTE is assigned because stage2_make_pte() + * writes the PTE using smp_store_release(). + */ + new = kvm_init_table_pte(childp, mm_ops); + stage2_make_pte(ctx, new); + return 0; +} + +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_split_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = mc, + }; + int ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; +} int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, @@ -1183,10 +1547,10 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, kvm_pgtable_force_pte_cb_t force_pte_cb) { size_t pgd_sz; - u64 vtcr = mmu->arch->vtcr; + u64 vtcr = mmu->vtcr; u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); - u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); @@ -1209,43 +1573,86 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) { u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); - u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; } -static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, - enum kvm_pgtable_walk_flags visit) +static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - if (!stage2_pte_is_counted(ctx->old)) + mm_ops->put_page(ctx->ptep); + return 0; +} + +static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); + + if (mm_ops->page_count(childp) != 1) return 0; + /* + * Drop references and clear the now stale PTE to avoid rewalking the + * freed page table. + */ mm_ops->put_page(ctx->ptep); + mm_ops->put_page(childp); + kvm_clear_pte(ctx->ptep); + return 0; +} - if (kvm_pte_table(ctx->old, ctx->level)) - mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); +static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + if (!stage2_pte_is_counted(ctx->old)) + return 0; - return 0; + switch (visit) { + case KVM_PGTABLE_WALK_LEAF: + return stage2_free_leaf(ctx); + case KVM_PGTABLE_WALK_TABLE_POST: + return stage2_free_table_post(ctx); + default: + return -EINVAL; + } } -void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size) { - size_t pgd_sz; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; - WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); +} + +void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) +{ + size_t pgd_sz; + pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); + + /* + * Since the pgtable is unlinked at this point, and not shared with + * other walkers, safely deference pgd with kvm_dereference_pteref_raw() + */ + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz); pgt->pgd = NULL; } -void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits)); + kvm_pgtable_stage2_destroy_pgd(pgt); +} + +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { @@ -1266,4 +1673,7 @@ void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pg }; WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1)); + + WARN_ON(mm_ops->page_count(pgtable) != 1); + mm_ops->put_page(pgtable); } diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 87a54375bd6e..5fd99763b54d 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -20,7 +20,7 @@ static bool __is_be(struct kvm_vcpu *vcpu) if (vcpu_mode_is_32bit(vcpu)) return !!(read_sysreg_el2(SYS_SPSR) & PSR_AA32_E_BIT); - return !!(read_sysreg(SCTLR_EL1) & SCTLR_ELx_EE); + return !!(read_sysreg_el1(SYS_SCTLR) & SCTLR_ELx_EE); } /* @@ -63,6 +63,10 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) return -1; } + /* Handle deactivation as a normal exit */ + if ((fault_ipa - vgic->vgic_cpu_base) >= GIC_CPU_DEACTIVATE) + return 0; + rd = kvm_vcpu_dabt_get_rd(vcpu); addr = kvm_vgic_global_state.vcpu_hyp_va; addr += fault_ipa - vgic->vgic_cpu_base; diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 6cb638b184b1..0b670a033fd8 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -14,11 +14,13 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include "../../vgic/vgic.h" + #define vtr_to_max_lr_idx(v) ((v) & 0xf) #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) -static u64 __gic_v3_get_lr(unsigned int lr) +u64 __gic_v3_get_lr(unsigned int lr) { switch (lr & 0xf) { case 0: @@ -58,7 +60,7 @@ static u64 __gic_v3_get_lr(unsigned int lr) unreachable(); } -static void __gic_v3_set_lr(u64 val, int lr) +void __gic_v3_set_lr(u64 val, int lr) { switch (lr & 0xf) { case 0: @@ -196,6 +198,11 @@ static u32 __vgic_v3_read_ap1rn(int n) return val; } +static u64 compute_ich_hcr(struct vgic_v3_cpu_if *cpu_if) +{ + return cpu_if->vgic_hcr | vgic_ich_hcr_trap_bits(); +} + void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) { u64 used_lrs = cpu_if->used_lrs; @@ -212,14 +219,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) } } - if (used_lrs || cpu_if->its_vpe.its_vm) { + if (used_lrs) { int i; u32 elrsr; elrsr = read_gicreg(ICH_ELRSR_EL2); - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; @@ -229,6 +234,23 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) __gic_v3_set_lr(0, i); } } + + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { + u64 val = read_gicreg(ICH_HCR_EL2); + cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; + cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; + } + + write_gicreg(0, ICH_HCR_EL2); + + /* + * Hack alert: On NV, this results in a trap so that the above write + * actually takes effect... No synchronisation is necessary, as we + * only care about the effects when this traps. + */ + read_gicreg(ICH_MISR_EL2); } void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) @@ -236,12 +258,10 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) u64 used_lrs = cpu_if->used_lrs; int i; - if (used_lrs || cpu_if->its_vpe.its_vm) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } + for (i = 0; i < used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); /* * Ensure that writes to the LRs, and on non-VHE systems ensure that @@ -268,8 +288,16 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) * starting to mess with the rest of the GIC, and VMCR_EL2 in * particular. This logic must be called before * __vgic_v3_restore_state(). + * + * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is + * provisioned at all. In order to prevent illegal accesses to the + * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 + * so that the trap bits can take effect. Yes, we *loves* the GIC. */ - if (!cpu_if->vgic_sre) { + if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) { + write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); + isb(); + } else if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); @@ -287,38 +315,42 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } } - /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. - */ - write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, - ICC_SRE_EL2); + /* Only disable SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { + /* + * Prevent the guest from touching the ICC_SRE_EL1 system + * register. Note that this may not have any effect, as + * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. + */ + write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, + ICC_SRE_EL2); + } /* - * If we need to trap system registers, we must write - * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected, + * If we need to trap system registers, we must write ICH_HCR_EL2 + * anyway, even if no interrupts are being injected. Note that this + * also applies if we don't expect any system register access (no + * vgic at all). In any case, no need to provide MI configuration. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) + write_gicreg(vgic_ich_hcr_trap_bits() | ICH_HCR_EL2_En, ICH_HCR_EL2); } void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) { u64 val; - if (!cpu_if->vgic_sre) { - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - } - - val = read_gicreg(ICC_SRE_EL2); - write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); + /* Only restore SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { + val = read_gicreg(ICC_SRE_EL2); + write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); - if (!cpu_if->vgic_sre) { - /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ - isb(); - write_gicreg(1, ICC_SRE_EL1); + if (!cpu_if->vgic_sre) { + /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ + isb(); + write_gicreg(1, ICC_SRE_EL1); + } } /* @@ -326,7 +358,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) * no interrupts were being injected, and we disable it again here. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(0, ICH_HCR_EL2); } @@ -363,7 +395,7 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) } } -void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) +static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -413,29 +445,43 @@ void __vgic_v3_init_lrs(void) */ u64 __vgic_v3_get_gic_config(void) { - u64 val, sre = read_gicreg(ICC_SRE_EL1); + u64 val, sre; unsigned long flags = 0; /* - * To check whether we have a MMIO-based (GICv2 compatible) - * CPU interface, we need to disable the system register - * view. To do that safely, we have to prevent any interrupt - * from firing (which would be deadly). - * - * Note that this only makes sense on VHE, as interrupts are - * already masked for nVHE as part of the exception entry to - * EL2. + * In compat mode, we cannot access ICC_SRE_EL1 at any EL + * other than EL1 itself; just return the + * ICH_VTR_EL2. ICC_IDR0_EL1 is only implemented on a GICv5 + * system, so we first check if we have GICv5 support. */ - if (has_vhe()) - flags = local_daif_save(); + if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return read_gicreg(ICH_VTR_EL2); + sre = read_gicreg(ICC_SRE_EL1); /* + * To check whether we have a MMIO-based (GICv2 compatible) + * CPU interface, we need to disable the system register + * view. + * * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates * that to be able to set ICC_SRE_EL1.SRE to 0, all the * interrupt overrides must be set. You've got to love this. + * + * As we always run VHE with HCR_xMO set, no extra xMO + * manipulation is required in that case. + * + * To safely disable SRE, we have to prevent any interrupt + * from firing (which would be deadly). This only makes sense + * on VHE, as interrupts are already masked for nVHE as part + * of the exception entry to EL2. */ - sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); - isb(); + if (has_vhe()) { + flags = local_daif_save(); + } else { + sysreg_clear_set_hcr(0, HCR_AMO | HCR_FMO | HCR_IMO); + isb(); + } + write_gicreg(0, ICC_SRE_EL1); isb(); @@ -443,11 +489,13 @@ u64 __vgic_v3_get_gic_config(void) write_gicreg(sre, ICC_SRE_EL1); isb(); - sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); - isb(); - if (has_vhe()) + if (has_vhe()) { local_daif_restore(flags); + } else { + sysreg_clear_set_hcr(HCR_AMO | HCR_FMO | HCR_IMO, 0); + isb(); + } val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); val |= read_gicreg(ICH_VTR_EL2); @@ -455,16 +503,40 @@ u64 __vgic_v3_get_gic_config(void) return val; } -u64 __vgic_v3_read_vmcr(void) +static void __vgic_v3_compat_mode_enable(void) +{ + if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return; + + sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, 0, ICH_VCTLR_EL2_V3); + /* Wait for V3 to become enabled */ + isb(); +} + +static u64 __vgic_v3_read_vmcr(void) { return read_gicreg(ICH_VMCR_EL2); } -void __vgic_v3_write_vmcr(u32 vmcr) +static void __vgic_v3_write_vmcr(u32 vmcr) { write_gicreg(vmcr, ICH_VMCR_EL2); } +void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + __vgic_v3_compat_mode_enable(); + + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. + */ + if (cpu_if->vgic_sre) + __vgic_v3_write_vmcr(cpu_if->vgic_vmcr); + __vgic_v3_restore_aprs(cpu_if); +} + static int __vgic_v3_bpr_min(void) { /* See Pseudocode for VPriorityGroup */ @@ -723,11 +795,11 @@ static void __vgic_v3_bump_eoicount(void) u32 hcr; hcr = read_gicreg(ICH_HCR_EL2); - hcr += 1 << ICH_HCR_EOIcount_SHIFT; + hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT; write_gicreg(hcr, ICH_HCR_EL2); } -static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vid = vcpu_get_reg(vcpu, rt); u64 lr_val; @@ -735,19 +807,25 @@ static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* EOImode == 0, nothing to be done here */ if (!(vmcr & ICH_VMCR_EOIM_MASK)) - return; + return 1; /* No deactivate to be performed on an LPI */ if (vid >= VGIC_MIN_LPI) - return; + return 1; lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); - if (lr == -1) { - __vgic_v3_bump_eoicount(); - return; + if (lr != -1) { + __vgic_v3_clear_active_lr(lr, lr_val); + return 1; } - __vgic_v3_clear_active_lr(lr, lr_val); + return 0; +} + +static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + if (!___vgic_v3_write_dir(vcpu, vmcr, rt)) + __vgic_v3_bump_eoicount(); } static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) @@ -983,9 +1061,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) - val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT); /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ @@ -1013,6 +1088,75 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) write_gicreg(vmcr, ICH_VMCR_EL2); } +static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, + u32 sysreg, bool is_read) +{ + u64 ich_hcr; + + if (!is_nested_ctxt(vcpu)) + return false; + + ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + switch (sysreg) { + case SYS_ICC_IGRPEN0_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP0Rn_EL1(0): + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_BPR0_EL1: + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_IAR0_EL1: + return ich_hcr & ICH_HCR_EL2_TALL0; + + case SYS_ICC_IGRPEN1_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP1Rn_EL1(0): + case SYS_ICC_AP1Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(3): + case SYS_ICC_BPR1_EL1: + case SYS_ICC_EOIR1_EL1: + case SYS_ICC_HPPIR1_EL1: + case SYS_ICC_IAR1_EL1: + return ich_hcr & ICH_HCR_EL2_TALL1; + + case SYS_ICC_DIR_EL1: + if (ich_hcr & ICH_HCR_EL2_TDIR) + return true; + + fallthrough; + + case SYS_ICC_RPR_EL1: + case SYS_ICC_CTLR_EL1: + case SYS_ICC_PMR_EL1: + return ich_hcr & ICH_HCR_EL2_TC; + + default: + return false; + } +} + int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) { int rt; @@ -1022,6 +1166,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) bool is_read; u32 sysreg; + if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return 0; + esr = kvm_vcpu_get_esr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { if (!kvm_condition_valid(vcpu)) { @@ -1036,6 +1183,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read)) + return 0; + switch (sysreg) { case SYS_ICC_IAR0_EL1: case SYS_ICC_IAR1_EL1: @@ -1110,6 +1260,21 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) case SYS_ICC_DIR_EL1: if (unlikely(is_read)) return 0; + /* + * Full exit if required to handle overflow deactivation, + * unless we can emulate it in the LRs (likely the majority + * of the cases). + */ + if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) { + int ret; + + ret = ___vgic_v3_write_dir(vcpu, __vgic_v3_read_vmcr(), + kvm_vcpu_sys_get_rt(vcpu)); + if (ret) + __kvm_skip_instr(vcpu); + + return ret; + } fn = __vgic_v3_write_dir; break; case SYS_ICC_RPR_EL1: diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile index 3b9e5464b5b3..afc4aed9231a 100644 --- a/arch/arm64/kvm/hyp/vhe/Makefile +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -6,6 +6,8 @@ asflags-y := -D__KVM_VHE_HYPERVISOR__ ccflags-y := -D__KVM_VHE_HYPERVISOR__ +CFLAGS_switch.o += -Wno-override-init + obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c index 289689b2682d..0100339b09e0 100644 --- a/arch/arm64/kvm/hyp/vhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -19,8 +19,3 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 1a97391fedd2..9984c492305a 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -33,37 +33,110 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); -static void __activate_traps(struct kvm_vcpu *vcpu) -{ - u64 val; +/* + * HCR_EL2 bits that the NV guest can freely change (no RES0/RES1 + * semantics, irrespective of the configuration), but that cannot be + * applied to the actual HW as things would otherwise break badly. + * + * - TGE: we want the guest to use EL1, which is incompatible with + * this bit being set + * + * - API/APK: they are already accounted for by vcpu_load(), and can + * only take effect across a load/put cycle (such as ERET) + * + * - FIEN: no way we let a guest have access to the RAS "Common Fault + * Injection" thing, whatever that does + */ +#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK | HCR_FIEN) - ___activate_traps(vcpu); +static u64 __compute_hcr(struct kvm_vcpu *vcpu) +{ + u64 guest_hcr, hcr = vcpu->arch.hcr_el2; - val = read_sysreg(cpacr_el1); - val |= CPACR_EL1_TTA; - val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | - CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); + if (!vcpu_has_nv(vcpu)) + return hcr; /* - * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to - * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, - * except for some missing controls, such as TAM. - * In this case, CPTR_EL2.TAM has the same position with or without - * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM - * shift value for trapping the AMU accesses. + * We rely on the invariant that a vcpu entered from HYP + * context must also exit in the same context, as only an ERET + * instruction can kick us out of it, and we obviously trap + * that sucker. PSTATE.M will get fixed-up on exit. */ + if (is_hyp_ctxt(vcpu)) { + host_data_set_flag(VCPU_IN_HYP_CONTEXT); + + hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; - val |= CPTR_EL2_TAM; + if (!vcpu_el2_e2h_is_set(vcpu)) + hcr |= HCR_NV1; - if (guest_owns_fp_regs(vcpu)) { - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; + /* + * Nothing in HCR_EL2 should impact running in hypervisor + * context, apart from bits we have defined as RESx (E2H, + * HCD and co), or that cannot be set directly (the EXCLUDE + * bits). Given that we OR the guest's view with the host's, + * we can use the 0 value as the starting point, and only + * use the config-driven RES1 bits. + */ + guest_hcr = kvm_vcpu_apply_reg_masks(vcpu, HCR_EL2, 0); + + write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); } else { - val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); - __activate_traps_fpsimd32(vcpu); + host_data_clear_flag(VCPU_IN_HYP_CONTEXT); + + guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + if (guest_hcr & HCR_NV) { + u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id())); + + /* Inherit the low bits from the actual register */ + va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0); + write_sysreg_s(va, SYS_VNCR_EL2); + + /* Force NV2 in case the guest is forgetful... */ + guest_hcr |= HCR_NV2; + } + + /* + * Exclude the guest's TWED configuration if it hasn't set TWE + * to avoid potentially delaying traps for the host. + */ + if (!(guest_hcr & HCR_TWE)) + guest_hcr &= ~(HCR_EL2_TWEDEn | HCR_EL2_TWEDEL); + } + + BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) && + host_data_test_flag(L1_VNCR_MAPPED)); + + return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); +} + +static void __activate_traps(struct kvm_vcpu *vcpu) +{ + u64 val; + + ___activate_traps(vcpu, __compute_hcr(vcpu)); + + if (has_cntpoff()) { + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * We're entrering the guest. Reload the correct + * values from memory now that TGE is clear. + */ + if (map.direct_ptimer == vcpu_ptimer(vcpu)) + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + if (map.direct_ptimer == vcpu_hptimer(vcpu)) + val = __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); + + if (map.direct_ptimer) { + write_sysreg_el0(val, SYS_CNTP_CVAL); + isb(); + } } - write_sysreg(val, cpacr_el1); + __activate_cptr_traps(vcpu); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); } @@ -75,7 +148,31 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) ___deactivate_traps(vcpu); - write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); + + if (has_cntpoff()) { + struct timer_map map; + u64 val, offset; + + get_timer_map(vcpu, &map); + + /* + * We're exiting the guest. Save the latest CVAL value + * to memory and apply the offset now that TGE is set. + */ + val = read_sysreg_el0(SYS_CNTP_CVAL); + if (map.direct_ptimer == vcpu_ptimer(vcpu)) + __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, val); + if (map.direct_ptimer == vcpu_hptimer(vcpu)) + __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, val); + + offset = read_sysreg_s(SYS_CNTPOFF_EL2); + + if (map.direct_ptimer && offset) { + write_sysreg_el0(val + offset, SYS_CNTP_CVAL); + isb(); + } + } /* * ARM errata 1165522 and 1530923 require the actual execution of the @@ -84,7 +181,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); - write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); + __deactivate_cptr_traps(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); @@ -92,34 +189,383 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) } NOKPROBE_SYMBOL(__deactivate_traps); -void activate_traps_vhe_load(struct kvm_vcpu *vcpu) +/* + * Disable IRQs in __vcpu_{load,put}_{activate,deactivate}_traps() to + * prevent a race condition between context switching of PMUSERENR_EL0 + * in __{activate,deactivate}_traps_common() and IPIs that attempts to + * update PMUSERENR_EL0. See also kvm_set_pmuserenr(). + */ +static void __vcpu_load_activate_traps(struct kvm_vcpu *vcpu) { + unsigned long flags; + + local_irq_save(flags); __activate_traps_common(vcpu); + local_irq_restore(flags); } -void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) +static void __vcpu_put_deactivate_traps(struct kvm_vcpu *vcpu) { + unsigned long flags; + + local_irq_save(flags); __deactivate_traps_common(vcpu); + local_irq_restore(flags); +} + +void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu) +{ + host_data_ptr(host_ctxt)->__hyp_running_vcpu = vcpu; + + __vcpu_load_switch_sysregs(vcpu); + __vcpu_load_activate_traps(vcpu); + __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); +} + +void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) +{ + __vcpu_put_deactivate_traps(vcpu); + __vcpu_put_switch_sysregs(vcpu); + + host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL; +} + +static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) +{ + unsigned long ctl; + u64 cval, cnt; + bool stat; + + switch (reg) { + case CNTP_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + cnt = compute_counter_value(vcpu_ptimer(vcpu)); + break; + case CNTV_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); + cnt = compute_counter_value(vcpu_vtimer(vcpu)); + break; + default: + BUG(); + } + + stat = cval <= cnt; + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat); + + return ctl; +} + +static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr, val; + + /* + * Having FEAT_ECV allows for a better quality of timer emulation. + * However, this comes at a huge cost in terms of traps. Try and + * satisfy the reads from guest's hypervisor context without + * returning to the kernel if we can. + */ + if (!is_hyp_ctxt(vcpu)) + return false; + + esr = kvm_vcpu_get_esr(vcpu); + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ) + return false; + + switch (esr_sys64_to_sysreg(esr)) { + case SYS_CNTP_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTP_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + break; + case SYS_CNTP_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el0(SYS_CNTP_CVAL); + + if (!has_cntpoff()) + val -= timer_get_offset(vcpu_hptimer(vcpu)); + } else { + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + } + break; + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + val = compute_counter_value(vcpu_hptimer(vcpu)); + break; + case SYS_CNTV_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTV_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CVAL); + else + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + val = compute_counter_value(vcpu_hvtimer(vcpu)); + break; + default: + return false; + } + + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 spsr, elr, mode; + + /* + * Going through the whole put/load motions is a waste of time + * if this is a VHE guest hypervisor returning to its own + * userspace, or the hypervisor performing a local exception + * return. No need to save/restore registers, no need to + * switch S2 MMU. Just do the canonical ERET. + * + * Unless the trap has to be forwarded further down the line, + * of course... + */ + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) || + (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_ERET)) + return false; + + spsr = read_sysreg_el1(SYS_SPSR); + mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL0t: + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + return false; + break; + case PSR_MODE_EL2t: + mode = PSR_MODE_EL1t; + break; + case PSR_MODE_EL2h: + mode = PSR_MODE_EL1h; + break; + default: + return false; + } + + /* If ERETAx fails, take the slow path */ + if (esr_iss_is_eretax(esr)) { + if (!(vcpu_has_ptrauth(vcpu) && kvm_auth_eretax(vcpu, &elr))) + return false; + } else { + elr = read_sysreg_el1(SYS_ELR); + } + + spsr = (spsr & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; + + write_sysreg_el2(spsr, SYS_SPSR); + write_sysreg_el2(elr, SYS_ELR); + + return true; +} + +static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + int ret = -EINVAL; + u32 instr; + u64 val; + + /* + * Ideally, we would never trap on EL2 S1 TLB invalidations using + * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}. + * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2, + * meaning that we can't track changes to the virtual TGE bit. So we + * have to leave HCR_EL2.TTLB set on the host. Oopsie... + * + * Try and handle these invalidation as quickly as possible, without + * fully exiting. Note that we don't need to consider any forwarding + * here, as having E2H+TGE set is the very definition of being + * InHost. + * + * For the lesser hypervisors out there that have failed to get on + * with the VHE program, we can also handle the nVHE style of EL2 + * invalidation. + */ + if (!(is_hyp_ctxt(vcpu))) + return false; + + instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + + if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) && + vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) || + kvm_supported_tlbi_s1e2_op (vcpu, instr)) + ret = __kvm_tlbi_s1e2(NULL, val, instr); + + if (ret) + return false; + + /* + * If we have to check for any VNCR mapping being invalidated, + * go back to the slow path for further processing. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) && + atomic_read(&vcpu->kvm->arch.vncr_map_count)) + return false; + + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + int rt; + + if (!is_hyp_ctxt(vcpu) || esr_sys64_to_sysreg(esr) != SYS_CPACR_EL1) + return false; + + rt = kvm_vcpu_sys_get_rt(vcpu); + + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ) { + vcpu_set_reg(vcpu, rt, __vcpu_sys_reg(vcpu, CPTR_EL2)); + } else { + vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, rt), CPTR_EL2); + __activate_cptr_traps(vcpu); + } + + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + + if (!vcpu_has_nv(vcpu)) + return false; + + if (sysreg != SYS_ZCR_EL2) + return false; + + if (guest_owns_fp_regs()) + return false; + + /* + * ZCR_EL2 traps are handled in the slow path, with the expectation + * that the guest's FP context has already been loaded onto the CPU. + * + * Load the guest's FP context and unconditionally forward to the + * slow path for handling (i.e. return false). + */ + kvm_hyp_handle_fpsimd(vcpu, exit_code); + return false; +} + +static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_timer(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_zcr_el2(vcpu, exit_code)) + return true; + + return kvm_hyp_handle_sysreg(vcpu, exit_code); +} + +static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 iss; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return false; + + /* + * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2 + * is populated with a correct ISS for a sysreg trap. These fruity + * parts are 64bit only, so unconditionally set IL. + */ + iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2)); + vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) | + FIELD_PREP(ESR_ELx_ISS_MASK, iss) | + ESR_ELx_IL; + return false; } static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, - [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg_vhe, [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, - [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, + [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, + [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret, + [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, + + /* Apple shenanigans */ + [0x3F] = kvm_hyp_handle_impdef, }; -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { - return hyp_exit_handlers; -} + synchronize_vcpu_pstate(vcpu, exit_code); -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) -{ + /* + * If we were in HYP context on entry, adjust the PSTATE view + * so that the usual helpers work correctly. This enforces our + * invariant that the guest's HYP context status is preserved + * across a run. + */ + if (vcpu_has_nv(vcpu) && + unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) { + u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL1t: + mode = PSR_MODE_EL2t; + break; + case PSR_MODE_EL1h: + mode = PSR_MODE_EL2h; + break; + } + + *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); + *vcpu_cpsr(vcpu) |= mode; + } + + /* Apply extreme paranoia! */ + BUG_ON(vcpu_has_nv(vcpu) && + !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu)); + + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } /* Switch to the guest for VHE systems running in EL2 */ @@ -129,24 +575,19 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt; u64 exit_code; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; - host_ctxt->__hyp_running_vcpu = vcpu; + host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; + fpsimd_lazy_switch_to_guest(vcpu); + sysreg_save_host_state_vhe(host_ctxt); /* - * ARM erratum 1165522 requires us to configure both stage 1 and - * stage 2 translation for the guest context before we clear - * HCR_EL2.TGE. - * - * We have already configured the guest's stage 1 translation in - * kvm_vcpu_load_sysregs_vhe above. We must now call - * __load_stage2 before __activate_traps, because - * __load_stage2 configures stage 2 translation, and - * __activate_traps clear HCR_EL2.TGE (among other things). + * Note that ARM erratum 1165522 requires us to configure both stage 1 + * and stage 2 translation for the guest context before we clear + * HCR_EL2.TGE. The stage 1 and stage 2 guest context has already been + * loaded on the CPU in kvm_vcpu_load_vhe(). */ - __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); __activate_traps(vcpu); __kvm_adjust_pc(vcpu); @@ -167,11 +608,21 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_restore_host_state_vhe(host_ctxt); - if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) - __fpsimd_save_fpexc32(vcpu); - __debug_switch_to_host(vcpu); + /* + * Ensure that all system register writes above have taken effect + * before returning to the host. In VHE mode, CPTR traps for + * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be + * manipulated after the ISB. + */ + isb(); + + fpsimd_lazy_switch_to_host(vcpu); + + if (guest_owns_fp_regs()) + __fpsimd_save_fpexc32(vcpu); + return exit_code; } NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); @@ -201,22 +652,15 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ local_daif_restore(DAIF_PROCCTX_NOIRQ); - /* - * When we exit from the guest we change a number of CPU configuration - * parameters, such as traps. Make sure these changes take effect - * before running the host or additional guests. - */ - isb(); - return ret; } -static void __hyp_call_panic(u64 spsr, u64 elr, u64 par) +static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) { struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; __deactivate_traps(vcpu); @@ -236,7 +680,6 @@ void __noreturn hyp_panic(void) u64 par = read_sysreg_par(); __hyp_call_panic(spsr, elr, par); - unreachable(); } asmlinkage void kvm_unexpected_el2_exception(void) diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 7b44f6b3b547..f28c6cf4fe1b 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -13,6 +13,139 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_nested.h> + +static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) +{ + /* These registers are common with EL1 */ + __vcpu_assign_sys_reg(vcpu, PAR_EL1, read_sysreg(par_el1)); + __vcpu_assign_sys_reg(vcpu, TPIDR_EL1, read_sysreg(tpidr_el1)); + + __vcpu_assign_sys_reg(vcpu, ESR_EL2, read_sysreg_el1(SYS_ESR)); + __vcpu_assign_sys_reg(vcpu, AFSR0_EL2, read_sysreg_el1(SYS_AFSR0)); + __vcpu_assign_sys_reg(vcpu, AFSR1_EL2, read_sysreg_el1(SYS_AFSR1)); + __vcpu_assign_sys_reg(vcpu, FAR_EL2, read_sysreg_el1(SYS_FAR)); + __vcpu_assign_sys_reg(vcpu, MAIR_EL2, read_sysreg_el1(SYS_MAIR)); + __vcpu_assign_sys_reg(vcpu, VBAR_EL2, read_sysreg_el1(SYS_VBAR)); + __vcpu_assign_sys_reg(vcpu, CONTEXTIDR_EL2, read_sysreg_el1(SYS_CONTEXTIDR)); + __vcpu_assign_sys_reg(vcpu, AMAIR_EL2, read_sysreg_el1(SYS_AMAIR)); + + /* + * In VHE mode those registers are compatible between EL1 and EL2, + * and the guest uses the _EL1 versions on the CPU naturally. + * So we save them into their _EL2 versions here. + * For nVHE mode we trap accesses to those registers, so our + * _EL2 copy in sys_regs[] is always up-to-date and we don't need + * to save anything here. + */ + if (vcpu_el2_e2h_is_set(vcpu)) { + u64 val; + + /* + * We don't save CPTR_EL2, as accesses to CPACR_EL1 + * are always trapped, ensuring that the in-memory + * copy is always up-to-date. A small blessing... + */ + __vcpu_assign_sys_reg(vcpu, SCTLR_EL2, read_sysreg_el1(SYS_SCTLR)); + __vcpu_assign_sys_reg(vcpu, TTBR0_EL2, read_sysreg_el1(SYS_TTBR0)); + __vcpu_assign_sys_reg(vcpu, TTBR1_EL2, read_sysreg_el1(SYS_TTBR1)); + __vcpu_assign_sys_reg(vcpu, TCR_EL2, read_sysreg_el1(SYS_TCR)); + + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { + __vcpu_assign_sys_reg(vcpu, TCR2_EL2, read_sysreg_el1(SYS_TCR2)); + + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + __vcpu_assign_sys_reg(vcpu, PIRE0_EL2, read_sysreg_el1(SYS_PIRE0)); + __vcpu_assign_sys_reg(vcpu, PIR_EL2, read_sysreg_el1(SYS_PIR)); + } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + __vcpu_assign_sys_reg(vcpu, POR_EL2, read_sysreg_el1(SYS_POR)); + } + + /* + * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where + * the interesting CNTHCTL_EL2 bits live. So preserve these + * bits when reading back the guest-visible value. + */ + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, &=, ~CNTKCTL_VALID_BITS); + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, |=, val); + } + + __vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1)); + __vcpu_assign_sys_reg(vcpu, ELR_EL2, read_sysreg_el1(SYS_ELR)); + __vcpu_assign_sys_reg(vcpu, SPSR_EL2, read_sysreg_el1(SYS_SPSR)); + + if (ctxt_has_sctlr2(&vcpu->arch.ctxt)) + __vcpu_assign_sys_reg(vcpu, SCTLR2_EL2, read_sysreg_el1(SYS_SCTLR2)); +} + +static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* These registers are common with EL1 */ + write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); + write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); + + write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2); + write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); + write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); + + if (vcpu_el2_e2h_is_set(vcpu)) { + /* + * In VHE mode those registers are compatible between + * EL1 and EL2. + */ + write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL); + } else { + /* + * CNTHCTL_EL2 only affects EL1 when running nVHE, so + * no need to restore it. + */ + val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2)); + write_sysreg_el1(val, SYS_SCTLR); + val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2)); + write_sysreg_el1(val, SYS_CPACR); + val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2)); + write_sysreg_el1(val, SYS_TTBR0); + val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2)); + write_sysreg_el1(val, SYS_TCR); + } + + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2); + + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0); + } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR); + } + + write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR); + write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); + + if (ctxt_has_sctlr2(&vcpu->arch.ctxt)) + write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR2_EL2), SYS_SCTLR2); +} /* * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and @@ -51,7 +184,7 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); /** - * kvm_vcpu_load_sysregs_vhe - Load guest system registers to the physical CPU + * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU * * @vcpu: The VCPU pointer * @@ -61,15 +194,27 @@ NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); * and loading system register state early avoids having to load them on * every entry to the VM. */ -void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) +void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; + u64 midr, mpidr; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); /* + * When running a normal EL1 guest, we only load a new vcpu + * after a context switch, which imvolves a DSB, so all + * speculative EL1&0 walks will have already completed. + * If running NV, the vcpu may transition between vEL1 and + * vEL2 without a context switch, so make sure we complete + * those walks before loading a new context. + */ + if (vcpu_has_nv(vcpu)) + dsb(nsh); + + /* * Load guest EL1 and user state * * We must restore the 32-bit state before the sysregs, thanks @@ -77,15 +222,30 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); - __sysreg_restore_el1_state(guest_ctxt); - vcpu_set_flag(vcpu, SYSREGS_ON_CPU); + if (unlikely(is_hyp_ctxt(vcpu))) { + __sysreg_restore_vel2_state(vcpu); + } else { + if (vcpu_has_nv(vcpu)) { + /* + * As we're restoring a nested guest, set the value + * provided by the guest hypervisor. + */ + midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2); + mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); + } else { + midr = ctxt_midr_el1(guest_ctxt); + mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); + } + + __sysreg_restore_el1_state(guest_ctxt, midr, mpidr); + } - activate_traps_vhe_load(vcpu); + vcpu_set_flag(vcpu, SYSREGS_ON_CPU); } /** - * kvm_vcpu_put_sysregs_vhe - Restore host system registers to the physical CPU + * __vcpu_put_switch_sysregs - Restore host system registers to the physical CPU * * @vcpu: The VCPU pointer * @@ -95,15 +255,18 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) * and deferring saving system register state until we're no longer running the * VCPU avoids having to save them on every exit from the VM. */ -void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) +void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; - deactivate_traps_vhe_put(vcpu); + host_ctxt = host_data_ptr(host_ctxt); + + if (unlikely(is_hyp_ctxt(vcpu))) + __sysreg_save_vel2_state(vcpu); + else + __sysreg_save_el1_state(guest_ctxt); - __sysreg_save_el1_state(guest_ctxt); __sysreg_save_user_state(guest_ctxt); __sysreg32_save_state(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 24cef9b87f9e..ec2569818629 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -11,18 +11,25 @@ #include <asm/tlbflush.h> struct tlb_inv_context { - unsigned long flags; - u64 tcr; - u64 sctlr; + struct kvm_s2_mmu *mmu; + unsigned long flags; + u64 tcr; + u64 sctlr; }; -static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, - struct tlb_inv_context *cxt) +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt) { + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); u64 val; local_irq_save(cxt->flags); + if (vcpu && mmu != vcpu->arch.hw_mmu) + cxt->mmu = vcpu->arch.hw_mmu; + else + cxt->mmu = NULL; + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* * For CPUs that are affected by ARM errata 1165522 or 1530923, @@ -56,20 +63,23 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, __load_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; - write_sysreg(val, hcr_el2); + write_sysreg_hcr(val); isb(); } -static void __tlb_switch_to_host(struct tlb_inv_context *cxt) +static void exit_vmid_context(struct tlb_inv_context *cxt) { /* * We're done with the TLB operation, let's restore the host's * view of HCR_EL2. */ - write_sysreg(0, vttbr_el2); - write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); isb(); + /* ... and the stage-2 MMU context that we switched away from */ + if (cxt->mmu) + __load_stage2(cxt->mmu, cxt->mmu->arch); + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* Restore the registers to what they were */ write_sysreg_el1(cxt->tcr, SYS_TCR); @@ -87,7 +97,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); /* * We could do so much better if we had the VA as well. @@ -108,7 +118,68 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + dsb(nshst); + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); + + exit_vmid_context(&cxt); +} + +void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, + phys_addr_t start, unsigned long pages) +{ + struct tlb_inv_context cxt; + unsigned long stride; + + /* + * Since the range of addresses may not be mapped at + * the same level, assume the worst case as PAGE_SIZE + */ + stride = PAGE_SIZE; + start = round_down(start, stride); + + dsb(ishst); + + /* Switch to requested VMID */ + enter_vmid_context(mmu, &cxt); + + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); + + dsb(ish); + __tlbi(vmalle1is); + dsb(ish); + isb(); + + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) @@ -118,13 +189,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) @@ -132,32 +203,166 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalle1); asm volatile("ic iallu"); dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_vm_context(void) { dsb(ishst); __tlbi(alle1is); + dsb(ish); +} + +/* + * TLB invalidation emulation for NV. For any given instruction, we + * perform the following transformtions: + * + * - a TLBI targeting EL2 S1 is remapped to EL1 S1 + * - a non-shareable TLBI is upgraded to being inner-shareable + * - an outer-shareable TLBI is also mapped to inner-shareable + * - an nXS TLBI is upgraded to XS + */ +int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding) +{ + struct tlb_inv_context cxt; + int ret = 0; /* - * VIPT and PIPT caches are not affected by VMID, so no maintenance - * is necessary across a VMID rollover. - * - * VPIPT caches constrain lookup and maintenance to the active VMID, - * so we need to invalidate lines with a stale VMID to avoid an ABA - * race after multiple rollovers. - * + * The guest will have provided its own DSB ISHST before trapping. + * If it hasn't, that's its own problem, and we won't paper over it + * (plus, there is plenty of extra synchronisation before we even + * get here...). */ - if (icache_is_vpipt()) - asm volatile("ic ialluis"); + if (mmu) + enter_vmid_context(mmu, &cxt); + + switch (sys_encoding) { + case OP_TLBI_ALLE2: + case OP_TLBI_ALLE2IS: + case OP_TLBI_ALLE2OS: + case OP_TLBI_VMALLE1: + case OP_TLBI_VMALLE1IS: + case OP_TLBI_VMALLE1OS: + case OP_TLBI_ALLE2NXS: + case OP_TLBI_ALLE2ISNXS: + case OP_TLBI_ALLE2OSNXS: + case OP_TLBI_VMALLE1NXS: + case OP_TLBI_VMALLE1ISNXS: + case OP_TLBI_VMALLE1OSNXS: + __tlbi(vmalle1is); + break; + case OP_TLBI_VAE2: + case OP_TLBI_VAE2IS: + case OP_TLBI_VAE2OS: + case OP_TLBI_VAE1: + case OP_TLBI_VAE1IS: + case OP_TLBI_VAE1OS: + case OP_TLBI_VAE2NXS: + case OP_TLBI_VAE2ISNXS: + case OP_TLBI_VAE2OSNXS: + case OP_TLBI_VAE1NXS: + case OP_TLBI_VAE1ISNXS: + case OP_TLBI_VAE1OSNXS: + __tlbi(vae1is, va); + break; + case OP_TLBI_VALE2: + case OP_TLBI_VALE2IS: + case OP_TLBI_VALE2OS: + case OP_TLBI_VALE1: + case OP_TLBI_VALE1IS: + case OP_TLBI_VALE1OS: + case OP_TLBI_VALE2NXS: + case OP_TLBI_VALE2ISNXS: + case OP_TLBI_VALE2OSNXS: + case OP_TLBI_VALE1NXS: + case OP_TLBI_VALE1ISNXS: + case OP_TLBI_VALE1OSNXS: + __tlbi(vale1is, va); + break; + case OP_TLBI_ASIDE1: + case OP_TLBI_ASIDE1IS: + case OP_TLBI_ASIDE1OS: + case OP_TLBI_ASIDE1NXS: + case OP_TLBI_ASIDE1ISNXS: + case OP_TLBI_ASIDE1OSNXS: + __tlbi(aside1is, va); + break; + case OP_TLBI_VAAE1: + case OP_TLBI_VAAE1IS: + case OP_TLBI_VAAE1OS: + case OP_TLBI_VAAE1NXS: + case OP_TLBI_VAAE1ISNXS: + case OP_TLBI_VAAE1OSNXS: + __tlbi(vaae1is, va); + break; + case OP_TLBI_VAALE1: + case OP_TLBI_VAALE1IS: + case OP_TLBI_VAALE1OS: + case OP_TLBI_VAALE1NXS: + case OP_TLBI_VAALE1ISNXS: + case OP_TLBI_VAALE1OSNXS: + __tlbi(vaale1is, va); + break; + case OP_TLBI_RVAE2: + case OP_TLBI_RVAE2IS: + case OP_TLBI_RVAE2OS: + case OP_TLBI_RVAE1: + case OP_TLBI_RVAE1IS: + case OP_TLBI_RVAE1OS: + case OP_TLBI_RVAE2NXS: + case OP_TLBI_RVAE2ISNXS: + case OP_TLBI_RVAE2OSNXS: + case OP_TLBI_RVAE1NXS: + case OP_TLBI_RVAE1ISNXS: + case OP_TLBI_RVAE1OSNXS: + __tlbi(rvae1is, va); + break; + case OP_TLBI_RVALE2: + case OP_TLBI_RVALE2IS: + case OP_TLBI_RVALE2OS: + case OP_TLBI_RVALE1: + case OP_TLBI_RVALE1IS: + case OP_TLBI_RVALE1OS: + case OP_TLBI_RVALE2NXS: + case OP_TLBI_RVALE2ISNXS: + case OP_TLBI_RVALE2OSNXS: + case OP_TLBI_RVALE1NXS: + case OP_TLBI_RVALE1ISNXS: + case OP_TLBI_RVALE1OSNXS: + __tlbi(rvale1is, va); + break; + case OP_TLBI_RVAAE1: + case OP_TLBI_RVAAE1IS: + case OP_TLBI_RVAAE1OS: + case OP_TLBI_RVAAE1NXS: + case OP_TLBI_RVAAE1ISNXS: + case OP_TLBI_RVAAE1OSNXS: + __tlbi(rvaae1is, va); + break; + case OP_TLBI_RVAALE1: + case OP_TLBI_RVAALE1IS: + case OP_TLBI_RVAALE1OS: + case OP_TLBI_RVAALE1NXS: + case OP_TLBI_RVAALE1ISNXS: + case OP_TLBI_RVAALE1OSNXS: + __tlbi(rvaale1is, va); + break; + default: + ret = -EINVAL; + } dsb(ish); + isb(); + + if (mmu) + exit_vmid_context(&cxt); + + return ret; } diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index c9f401fa01a9..58c5fe7d7572 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -15,6 +15,8 @@ GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0) #define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES \ GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2 \ + GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT - 1, 0) static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) { @@ -44,10 +46,10 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) feature = smccc_get_arg1(vcpu); switch (feature) { case KVM_PTP_VIRT_COUNTER: - cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2); + cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.voffset; break; case KVM_PTP_PHYS_COUNTER: - cycles = systime_snapshot.cycles; + cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.poffset; break; default: return; @@ -65,7 +67,7 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) val[3] = lower_32_bits(cycles); } -static bool kvm_hvc_call_default_allowed(u32 func_id) +static bool kvm_smccc_default_allowed(u32 func_id) { switch (func_id) { /* @@ -93,7 +95,7 @@ static bool kvm_hvc_call_default_allowed(u32 func_id) } } -static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id) +static bool kvm_smccc_test_fw_bmap(struct kvm_vcpu *vcpu, u32 func_id) { struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; @@ -117,20 +119,172 @@ static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id) return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_PTP, &smccc_feat->vendor_hyp_bmap); default: - return kvm_hvc_call_default_allowed(func_id); + return false; } } -int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) +#define SMC32_ARCH_RANGE_BEGIN ARM_SMCCC_VERSION_FUNC_ID +#define SMC32_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + 0, ARM_SMCCC_FUNC_MASK) + +#define SMC64_ARCH_RANGE_BEGIN ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + 0, 0) +#define SMC64_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + 0, ARM_SMCCC_FUNC_MASK) + +static int kvm_smccc_filter_insert_reserved(struct kvm *kvm) +{ + int r; + + /* + * Prevent userspace from handling any SMCCC calls in the architecture + * range, avoiding the risk of misrepresenting Spectre mitigation status + * to the guest. + */ + r = mtree_insert_range(&kvm->arch.smccc_filter, + SMC32_ARCH_RANGE_BEGIN, SMC32_ARCH_RANGE_END, + xa_mk_value(KVM_SMCCC_FILTER_HANDLE), + GFP_KERNEL_ACCOUNT); + if (r) + goto out_destroy; + + r = mtree_insert_range(&kvm->arch.smccc_filter, + SMC64_ARCH_RANGE_BEGIN, SMC64_ARCH_RANGE_END, + xa_mk_value(KVM_SMCCC_FILTER_HANDLE), + GFP_KERNEL_ACCOUNT); + if (r) + goto out_destroy; + + return 0; +out_destroy: + mtree_destroy(&kvm->arch.smccc_filter); + return r; +} + +static bool kvm_smccc_filter_configured(struct kvm *kvm) +{ + return !mtree_empty(&kvm->arch.smccc_filter); +} + +static int kvm_smccc_set_filter(struct kvm *kvm, struct kvm_smccc_filter __user *uaddr) +{ + const void *zero_page = page_to_virt(ZERO_PAGE(0)); + struct kvm_smccc_filter filter; + u32 start, end; + int r; + + if (copy_from_user(&filter, uaddr, sizeof(filter))) + return -EFAULT; + + if (memcmp(filter.pad, zero_page, sizeof(filter.pad))) + return -EINVAL; + + start = filter.base; + end = start + filter.nr_functions - 1; + + if (end < start || filter.action >= NR_SMCCC_FILTER_ACTIONS) + return -EINVAL; + + mutex_lock(&kvm->arch.config_lock); + + if (kvm_vm_has_ran_once(kvm)) { + r = -EBUSY; + goto out_unlock; + } + + if (!kvm_smccc_filter_configured(kvm)) { + r = kvm_smccc_filter_insert_reserved(kvm); + if (WARN_ON_ONCE(r)) + goto out_unlock; + } + + r = mtree_insert_range(&kvm->arch.smccc_filter, start, end, + xa_mk_value(filter.action), GFP_KERNEL_ACCOUNT); +out_unlock: + mutex_unlock(&kvm->arch.config_lock); + return r; +} + +static u8 kvm_smccc_filter_get_action(struct kvm *kvm, u32 func_id) +{ + unsigned long idx = func_id; + void *val; + + if (!kvm_smccc_filter_configured(kvm)) + return KVM_SMCCC_FILTER_HANDLE; + + /* + * But where's the error handling, you say? + * + * mt_find() returns NULL if no entry was found, which just so happens + * to match KVM_SMCCC_FILTER_HANDLE. + */ + val = mt_find(&kvm->arch.smccc_filter, &idx, idx); + return xa_to_value(val); +} + +static u8 kvm_smccc_get_action(struct kvm_vcpu *vcpu, u32 func_id) +{ + /* + * Intervening actions in the SMCCC filter take precedence over the + * pseudo-firmware register bitmaps. + */ + u8 action = kvm_smccc_filter_get_action(vcpu->kvm, func_id); + if (action != KVM_SMCCC_FILTER_HANDLE) + return action; + + if (kvm_smccc_test_fw_bmap(vcpu, func_id) || + kvm_smccc_default_allowed(func_id)) + return KVM_SMCCC_FILTER_HANDLE; + + return KVM_SMCCC_FILTER_DENY; +} + +static void kvm_prepare_hypercall_exit(struct kvm_vcpu *vcpu, u32 func_id) +{ + u8 ec = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu)); + struct kvm_run *run = vcpu->run; + u64 flags = 0; + + if (ec == ESR_ELx_EC_SMC32 || ec == ESR_ELx_EC_SMC64) + flags |= KVM_HYPERCALL_EXIT_SMC; + + if (!kvm_vcpu_trap_il_is32bit(vcpu)) + flags |= KVM_HYPERCALL_EXIT_16BIT; + + run->exit_reason = KVM_EXIT_HYPERCALL; + run->hypercall = (typeof(run->hypercall)) { + .nr = func_id, + .flags = flags, + }; +} + +int kvm_smccc_call_handler(struct kvm_vcpu *vcpu) { struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; u32 func_id = smccc_get_function(vcpu); u64 val[4] = {SMCCC_RET_NOT_SUPPORTED}; u32 feature; + u8 action; gpa_t gpa; + uuid_t uuid; - if (!kvm_hvc_call_allowed(vcpu, func_id)) + action = kvm_smccc_get_action(vcpu, func_id); + switch (action) { + case KVM_SMCCC_FILTER_HANDLE: + break; + case KVM_SMCCC_FILTER_DENY: goto out; + case KVM_SMCCC_FILTER_FWD_TO_USER: + kvm_prepare_hypercall_exit(vcpu, func_id); + return 0; + default: + WARN_RATELIMIT(1, "Unhandled SMCCC filter action: %d\n", action); + goto out; + } switch (func_id) { case ARM_SMCCC_VERSION_FUNC_ID: @@ -166,7 +320,7 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) * to the guest, and hide SSBS so that the * guest stays protected. */ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) break; fallthrough; case SPECTRE_UNAFFECTED: @@ -198,17 +352,20 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; case ARM_SMCCC_HV_PV_TIME_ST: gpa = kvm_init_stolen_time(vcpu); - if (gpa != GPA_INVALID) + if (gpa != INVALID_GPA) val[0] = gpa; break; case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: - val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0; - val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1; - val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2; - val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; + uuid = ARM_SMCCC_VENDOR_HYP_UID_KVM; + val[0] = smccc_uuid_to_reg(&uuid, 0); + val[1] = smccc_uuid_to_reg(&uuid, 1); + val[2] = smccc_uuid_to_reg(&uuid, 2); + val[3] = smccc_uuid_to_reg(&uuid, 3); break; case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: val[0] = smccc_feat->vendor_hyp_bmap; + /* Function numbers 2-63 are reserved for pKVM for now */ + val[2] = smccc_feat->vendor_hyp_bmap_2; break; case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: kvm_ptp_get_time(vcpu, val); @@ -236,6 +393,7 @@ static const u64 kvm_arm_fw_reg_ids[] = { KVM_REG_ARM_STD_BMAP, KVM_REG_ARM_STD_HYP_BMAP, KVM_REG_ARM_VENDOR_HYP_BMAP, + KVM_REG_ARM_VENDOR_HYP_BMAP_2, }; void kvm_arm_init_hypercalls(struct kvm *kvm) @@ -245,6 +403,13 @@ void kvm_arm_init_hypercalls(struct kvm *kvm) smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES; smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES; smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; + + mt_init(&kvm->arch.smccc_filter); +} + +void kvm_arm_teardown_hypercalls(struct kvm *kvm) +{ + mtree_destroy(&kvm->arch.smccc_filter); } int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) @@ -270,7 +435,7 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) * Convert the workaround level into an easy-to-compare number, where higher * values mean better protection. */ -static int get_kernel_wa_level(u64 regid) +static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid) { switch (regid) { case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: @@ -291,7 +456,7 @@ static int get_kernel_wa_level(u64 regid) * don't have any FW mitigation if SSBS is there at * all times. */ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; fallthrough; case SPECTRE_UNAFFECTED: @@ -328,7 +493,7 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + val = get_kernel_wa_level(vcpu, reg->id) & KVM_REG_FEATURE_LEVEL_MASK; break; case KVM_REG_ARM_STD_BMAP: val = READ_ONCE(smccc_feat->std_bmap); @@ -339,6 +504,9 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM_VENDOR_HYP_BMAP: val = READ_ONCE(smccc_feat->vendor_hyp_bmap); break; + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: + val = READ_ONCE(smccc_feat->vendor_hyp_bmap_2); + break; default: return -ENOENT; } @@ -369,6 +537,10 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val) fw_reg_bmap = &smccc_feat->vendor_hyp_bmap; fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; break; + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: + fw_reg_bmap = &smccc_feat->vendor_hyp_bmap_2; + fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2; + break; default: return -ENOENT; } @@ -377,17 +549,16 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val) if (val & ~fw_reg_features) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); - if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) && - val != *fw_reg_bmap) { + if (kvm_vm_has_ran_once(kvm) && val != *fw_reg_bmap) { ret = -EBUSY; goto out; } WRITE_ONCE(*fw_reg_bmap, val); out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); return ret; } @@ -397,6 +568,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) u64 val; int wa_level; + if (KVM_REG_SIZE(reg->id) != sizeof(val)) + return -ENOENT; if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) return -EFAULT; @@ -405,7 +578,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { bool wants_02; - wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); + wants_02 = vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2); switch (val) { case KVM_ARM_PSCI_0_1: @@ -416,6 +589,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_ARM_PSCI_0_2: case KVM_ARM_PSCI_1_0: case KVM_ARM_PSCI_1_1: + case KVM_ARM_PSCI_1_2: + case KVM_ARM_PSCI_1_3: if (!wants_02) return -EINVAL; vcpu->kvm->arch.psci_version = val; @@ -429,7 +604,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (val & ~KVM_REG_FEATURE_LEVEL_MASK) return -EINVAL; - if (get_kernel_wa_level(reg->id) < val) + if (get_kernel_wa_level(vcpu, reg->id) < val) return -EINVAL; return 0; @@ -465,13 +640,14 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the * other way around. */ - if (get_kernel_wa_level(reg->id) < wa_level) + if (get_kernel_wa_level(vcpu, reg->id) < wa_level) return -EINVAL; return 0; case KVM_REG_ARM_STD_BMAP: case KVM_REG_ARM_STD_HYP_BMAP: case KVM_REG_ARM_VENDOR_HYP_BMAP: + case KVM_REG_ARM_VENDOR_HYP_BMAP_2: return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val); default: return -ENOENT; @@ -479,3 +655,25 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) return -EINVAL; } + +int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VM_SMCCC_FILTER: + return 0; + default: + return -ENXIO; + } +} + +int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + void __user *uaddr = (void __user *)attr->addr; + + switch (attr->attr) { + case KVM_ARM_VM_SMCCC_FILTER: + return kvm_smccc_set_filter(kvm, uaddr); + default: + return -ENXIO; + } +} diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index f32f4a2a347f..dfcd66c65517 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -12,17 +12,130 @@ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> #include <asm/esr.h> +static unsigned int exception_target_el(struct kvm_vcpu *vcpu) +{ + /* If not nesting, EL1 is the only possible exception target */ + if (likely(!vcpu_has_nv(vcpu))) + return PSR_MODE_EL1h; + + /* + * With NV, we need to pick between EL1 and EL2. Note that we + * never deal with a nesting exception here, hence never + * changing context, and the exception itself can be delayed + * until the next entry. + */ + switch(*vcpu_cpsr(vcpu) & PSR_MODE_MASK) { + case PSR_MODE_EL2h: + case PSR_MODE_EL2t: + return PSR_MODE_EL2h; + case PSR_MODE_EL1h: + case PSR_MODE_EL1t: + return PSR_MODE_EL1h; + case PSR_MODE_EL0t: + return vcpu_el2_tge_is_set(vcpu) ? PSR_MODE_EL2h : PSR_MODE_EL1h; + default: + BUG(); + } +} + +static enum vcpu_sysreg exception_esr_elx(struct kvm_vcpu *vcpu) +{ + if (exception_target_el(vcpu) == PSR_MODE_EL2h) + return ESR_EL2; + + return ESR_EL1; +} + +static enum vcpu_sysreg exception_far_elx(struct kvm_vcpu *vcpu) +{ + if (exception_target_el(vcpu) == PSR_MODE_EL2h) + return FAR_EL2; + + return FAR_EL1; +} + +static void pend_sync_exception(struct kvm_vcpu *vcpu) +{ + if (exception_target_el(vcpu) == PSR_MODE_EL1h) + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + else + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC); +} + +static void pend_serror_exception(struct kvm_vcpu *vcpu) +{ + if (exception_target_el(vcpu) == PSR_MODE_EL1h) + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SERR); + else + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR); +} + +static bool __effective_sctlr2_bit(struct kvm_vcpu *vcpu, unsigned int idx) +{ + u64 sctlr2; + + if (!kvm_has_sctlr2(vcpu->kvm)) + return false; + + if (is_nested_ctxt(vcpu) && + !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_SCTLR2En)) + return false; + + if (exception_target_el(vcpu) == PSR_MODE_EL1h) + sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL1); + else + sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL2); + + return sctlr2 & BIT(idx); +} + +static bool effective_sctlr2_ease(struct kvm_vcpu *vcpu) +{ + return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_EASE_SHIFT); +} + +static bool effective_sctlr2_nmea(struct kvm_vcpu *vcpu) +{ + return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_NMEA_SHIFT); +} + static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr) { unsigned long cpsr = *vcpu_cpsr(vcpu); bool is_aarch32 = vcpu_mode_is_32bit(vcpu); - u64 esr = 0; + u64 esr = 0, fsc; + int level; - kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + /* + * If injecting an abort from a failed S1PTW, rewalk the S1 PTs to + * find the failing level. If we can't find it, assume the error was + * transient and restart without changing the state. + */ + if (kvm_vcpu_abt_iss1tw(vcpu)) { + u64 hpfar = kvm_vcpu_get_fault_ipa(vcpu); + int ret; + + if (hpfar == INVALID_GPA) + return; + + ret = __kvm_find_s1_desc_level(vcpu, addr, hpfar, &level); + if (ret) + return; + + WARN_ON_ONCE(level < -1 || level > 3); + fsc = ESR_ELx_FSC_SEA_TTW(level); + } else { + fsc = ESR_ELx_FSC_EXTABT; + } - vcpu_write_sys_reg(vcpu, addr, FAR_EL1); + /* This delight is brought to you by FEAT_DoubleFault2. */ + if (effective_sctlr2_ease(vcpu)) + pend_serror_exception(vcpu); + else + pend_sync_exception(vcpu); /* * Build an {i,d}abort, depending on the level and the @@ -43,14 +156,17 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr if (!is_iabt) esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT; - vcpu_write_sys_reg(vcpu, esr | ESR_ELx_FSC_EXTABT, ESR_EL1); + esr |= fsc; + + vcpu_write_sys_reg(vcpu, addr, exception_far_elx(vcpu)); + vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); } static void inject_undef64(struct kvm_vcpu *vcpu) { u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + pend_sync_exception(vcpu); /* * Build an unknown exception, depending on the instruction @@ -59,7 +175,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) if (kvm_vcpu_trap_il_is32bit(vcpu)) esr |= ESR_ELx_IL; - vcpu_write_sys_reg(vcpu, esr, ESR_EL1); + vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); } #define DFSR_FSC_EXTABT_LPAE 0x10 @@ -85,7 +201,7 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) if (vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE) { fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; } else { - /* no need to shuffle FS[4] into DFSR[10] as its 0 */ + /* no need to shuffle FS[4] into DFSR[10] as it's 0 */ fsr = DFSR_FSC_EXTABT_nLPAE; } @@ -106,36 +222,35 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) vcpu_write_sys_reg(vcpu, far, FAR_EL1); } -/** - * kvm_inject_dabt - inject a data abort into the guest - * @vcpu: The VCPU to receive the data abort - * @addr: The address to report in the DFAR - * - * It is assumed that this code is called from the VCPU thread and that the - * VCPU therefore is not currently executing guest code. - */ -void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) +static void __kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr) { if (vcpu_el1_is_32bit(vcpu)) - inject_abt32(vcpu, false, addr); + inject_abt32(vcpu, iabt, addr); else - inject_abt64(vcpu, false, addr); + inject_abt64(vcpu, iabt, addr); } -/** - * kvm_inject_pabt - inject a prefetch abort into the guest - * @vcpu: The VCPU to receive the prefetch abort - * @addr: The address to report in the DFAR - * - * It is assumed that this code is called from the VCPU thread and that the - * VCPU therefore is not currently executing guest code. - */ -void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) +static bool kvm_sea_target_is_el2(struct kvm_vcpu *vcpu) { - if (vcpu_el1_is_32bit(vcpu)) - inject_abt32(vcpu, true, addr); - else - inject_abt64(vcpu, true, addr); + if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_TGE | HCR_TEA)) + return true; + + if (!vcpu_mode_priv(vcpu)) + return false; + + return (*vcpu_cpsr(vcpu) & PSR_A_BIT) && + (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA); +} + +int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr) +{ + lockdep_assert_held(&vcpu->mutex); + + if (is_nested_ctxt(vcpu) && kvm_sea_target_is_el2(vcpu)) + return kvm_inject_nested_sea(vcpu, iabt, addr); + + __kvm_inject_sea(vcpu, iabt, addr); + return 1; } void kvm_inject_size_fault(struct kvm_vcpu *vcpu) @@ -145,25 +260,22 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu) addr = kvm_vcpu_get_fault_ipa(vcpu); addr |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); - if (kvm_vcpu_trap_is_iabt(vcpu)) - kvm_inject_pabt(vcpu, addr); - else - kvm_inject_dabt(vcpu, addr); + __kvm_inject_sea(vcpu, kvm_vcpu_trap_is_iabt(vcpu), addr); /* * If AArch64 or LPAE, set FSC to 0 to indicate an Address * Size Fault at level 0, as if exceeding PARange. * * Non-LPAE guests will only get the external abort, as there - * is no way to to describe the ASF. + * is no way to describe the ASF. */ if (vcpu_el1_is_32bit(vcpu) && !(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE)) return; - esr = vcpu_read_sys_reg(vcpu, ESR_EL1); + esr = vcpu_read_sys_reg(vcpu, exception_esr_elx(vcpu)); esr &= ~GENMASK_ULL(5, 0); - vcpu_write_sys_reg(vcpu, esr, ESR_EL1); + vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); } /** @@ -181,25 +293,70 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu) inject_undef64(vcpu); } -void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 esr) +static bool serror_is_masked(struct kvm_vcpu *vcpu) { - vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK); - *vcpu_hcr(vcpu) |= HCR_VSE; + return (*vcpu_cpsr(vcpu) & PSR_A_BIT) && !effective_sctlr2_nmea(vcpu); } -/** - * kvm_inject_vabt - inject an async abort / SError into the guest - * @vcpu: The VCPU to receive the exception - * - * It is assumed that this code is called from the VCPU thread and that the - * VCPU therefore is not currently executing guest code. - * - * Systems with the RAS Extensions specify an imp-def ESR (ISV/IDS = 1) with - * the remaining ISS all-zeros so that this error is not interpreted as an - * uncategorized RAS error. Without the RAS Extensions we can't specify an ESR - * value, so the CPU generates an imp-def value. - */ -void kvm_inject_vabt(struct kvm_vcpu *vcpu) +static bool kvm_serror_target_is_el2(struct kvm_vcpu *vcpu) { - kvm_set_sei_esr(vcpu, ESR_ELx_ISV); + if (is_hyp_ctxt(vcpu) || vcpu_el2_amo_is_set(vcpu)) + return true; + + if (!(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA)) + return false; + + /* + * In another example where FEAT_DoubleFault2 is entirely backwards, + * "masked" as it relates to the routing effects of HCRX_EL2.TMEA + * doesn't consider SCTLR2_EL1.NMEA. That is to say, even if EL1 asked + * for non-maskable SErrors, the EL2 bit takes priority if A is set. + */ + if (vcpu_mode_priv(vcpu)) + return *vcpu_cpsr(vcpu) & PSR_A_BIT; + + /* + * Otherwise SErrors are considered unmasked when taken from EL0 and + * NMEA is set. + */ + return serror_is_masked(vcpu); +} + +static bool kvm_serror_undeliverable_at_el2(struct kvm_vcpu *vcpu) +{ + return !(vcpu_el2_tge_is_set(vcpu) || vcpu_el2_amo_is_set(vcpu)); +} + +int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr) +{ + lockdep_assert_held(&vcpu->mutex); + + if (is_nested_ctxt(vcpu) && kvm_serror_target_is_el2(vcpu)) + return kvm_inject_nested_serror(vcpu, esr); + + if (vcpu_is_el2(vcpu) && kvm_serror_undeliverable_at_el2(vcpu)) { + vcpu_set_vsesr(vcpu, esr); + vcpu_set_flag(vcpu, NESTED_SERROR_PENDING); + return 1; + } + + /* + * Emulate the exception entry if SErrors are unmasked. This is useful if + * the vCPU is in a nested context w/ vSErrors enabled then we've already + * delegated he hardware vSError context (i.e. HCR_EL2.VSE, VSESR_EL2, + * VDISR_EL2) to the guest hypervisor. + * + * As we're emulating the SError injection we need to explicitly populate + * ESR_ELx.EC because hardware will not do it on our behalf. + */ + if (!serror_is_masked(vcpu)) { + pend_serror_exception(vcpu); + esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR); + vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); + return 1; + } + + vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK); + *vcpu_hcr(vcpu) |= HCR_VSE; + return 1; } diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 3dd38a151d2a..54f9358c9e0e 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -72,6 +72,33 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) return data; } +static bool kvm_pending_external_abort(struct kvm_vcpu *vcpu) +{ + if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) + return false; + + if (vcpu_el1_is_32bit(vcpu)) { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA32_UND): + case unpack_vcpu_flag(EXCEPT_AA32_IABT): + case unpack_vcpu_flag(EXCEPT_AA32_DABT): + return true; + default: + return false; + } + } else { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR): + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR): + return true; + default: + return false; + } + } +} + /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation * or in-kernel IO emulation @@ -84,9 +111,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) unsigned int len; int mask; - /* Detect an already handled MMIO return */ - if (unlikely(!vcpu->mmio_needed)) - return 0; + /* + * Detect if the MMIO return was already handled or if userspace aborted + * the MMIO access. + */ + if (unlikely(!vcpu->mmio_needed || kvm_pending_external_abort(vcpu))) + return 1; vcpu->mmio_needed = 0; @@ -117,7 +147,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) */ kvm_incr_pc(vcpu); - return 0; + return 1; } int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) @@ -133,8 +163,17 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) /* * No valid syndrome? Ask userspace for help if it has * volunteered to do so, and bail out otherwise. + * + * In the protected VM case, there isn't much userspace can do + * though, so directly deliver an exception to the guest. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { + trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), + kvm_vcpu_get_hfar(vcpu), fault_ipa); + + if (vcpu_is_protected(vcpu)) + return kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; @@ -143,7 +182,6 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) return 0; } - kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n"); return -ENOSYS; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a3ee3b605c9b..48d7c372a4cd 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -4,18 +4,20 @@ * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ +#include <linux/acpi.h> #include <linux/mman.h> #include <linux/kvm_host.h> #include <linux/io.h> #include <linux/hugetlb.h> #include <linux/sched/signal.h> #include <trace/events/kvm.h> +#include <asm/acpi.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> -#include <asm/kvm_ras.h> +#include <asm/kvm_pkvm.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/virt.h> @@ -25,20 +27,31 @@ static struct kvm_pgtable *hyp_pgtable; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); -static unsigned long hyp_idmap_start; -static unsigned long hyp_idmap_end; -static phys_addr_t hyp_idmap_vector; +static unsigned long __ro_after_init hyp_idmap_start; +static unsigned long __ro_after_init hyp_idmap_end; +static phys_addr_t __ro_after_init hyp_idmap_vector; -static unsigned long io_map_base; +u32 __ro_after_init __hyp_va_bits; -static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +static unsigned long __ro_after_init io_map_base; + +#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) + +static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, + phys_addr_t size) { - phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); phys_addr_t boundary = ALIGN_DOWN(addr + size, size); return (boundary - 1 < end - 1) ? boundary : end; } +static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + + return __stage2_range_addr_end(addr, end, size); +} + /* * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, @@ -46,16 +59,17 @@ static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) * long will also starve other vCPUs. We have to also make sure that the page * tables are not freed while we released the lock. */ -static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, +static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end, int (*fn)(struct kvm_pgtable *, u64, u64), bool resched) { + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); int ret; u64 next; do { - struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; + struct kvm_pgtable *pgt = mmu->pgt; if (!pgt) return -EINVAL; @@ -71,8 +85,81 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, return ret; } -#define stage2_apply_range_resched(kvm, addr, end, fn) \ - stage2_apply_range(kvm, addr, end, fn, true) +#define stage2_apply_range_resched(mmu, addr, end, fn) \ + stage2_apply_range(mmu, addr, end, fn, true) + +/* + * Get the maximum number of page-tables pages needed to split a range + * of blocks into PAGE_SIZE PTEs. It assumes the range is already + * mapped at level 2, or at level 1 if allowed. + */ +static int kvm_mmu_split_nr_page_tables(u64 range) +{ + int n = 0; + + if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) + n += DIV_ROUND_UP(range, PUD_SIZE); + n += DIV_ROUND_UP(range, PMD_SIZE); + return n; +} + +static bool need_split_memcache_topup_or_resched(struct kvm *kvm) +{ + struct kvm_mmu_memory_cache *cache; + u64 chunk_size, min; + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; + + chunk_size = kvm->arch.mmu.split_page_chunk_size; + min = kvm_mmu_split_nr_page_tables(chunk_size); + cache = &kvm->arch.mmu.split_page_cache; + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end) +{ + struct kvm_mmu_memory_cache *cache; + struct kvm_pgtable *pgt; + int ret, cache_capacity; + u64 next, chunk_size; + + lockdep_assert_held_write(&kvm->mmu_lock); + + chunk_size = kvm->arch.mmu.split_page_chunk_size; + cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); + + if (chunk_size == 0) + return 0; + + cache = &kvm->arch.mmu.split_page_cache; + + do { + if (need_split_memcache_topup_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* Eager page splitting is best-effort. */ + ret = __kvm_mmu_topup_memory_cache(cache, + cache_capacity, + cache_capacity); + write_lock(&kvm->mmu_lock); + if (ret) + break; + } + + pgt = kvm->arch.mmu.pgt; + if (!pgt) + return -EINVAL; + + next = __stage2_range_addr_end(addr, end, chunk_size); + ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); + if (ret) + break; + } while (addr = next, addr != end); + + return ret; +} static bool memslot_is_logging(struct kvm_memory_slot *memslot) { @@ -80,20 +167,31 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) } /** - * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 + * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 * @kvm: pointer to kvm structure. * * Interface to HYP function to flush all VM TLB entries */ -void kvm_flush_remote_tlbs(struct kvm *kvm) +int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { - ++kvm->stat.generic.remote_tlb_flush_requests; - kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); + return 0; } -static bool kvm_is_device_pfn(unsigned long pfn) +int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, + gfn_t gfn, u64 nr_pages) { - return !pfn_is_map_memory(pfn); + u64 size = nr_pages << PAGE_SHIFT; + u64 addr = gfn << PAGE_SHIFT; + + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); + return 0; } static void *stage2_memcache_zalloc_page(void *arg) @@ -130,21 +228,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size) static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; -static void stage2_free_removed_table_rcu_cb(struct rcu_head *head) +static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) { struct page *page = container_of(head, struct page, rcu_head); void *pgtable = page_to_virt(page); - u32 level = page_private(page); + s8 level = page_private(page); - kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level); + KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); } -static void stage2_free_removed_table(void *addr, u32 level) +static void stage2_free_unlinked_table(void *addr, s8 level) { struct page *page = virt_to_page(addr); set_page_private(page, (unsigned long)level); - call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb); + call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); } static void kvm_host_get_page(void *addr) @@ -216,7 +314,7 @@ static void invalidate_icache_guest_page(void *va, size_t size) * does. */ /** - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range + * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range * @mmu: The KVM stage-2 MMU pointer * @start: The intermediate physical base address of the range to unmap * @size: The size of the area to unmap @@ -235,13 +333,19 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap, + WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), may_block)); } -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block) { - __unmap_stage2_range(mmu, start, size, true); + __unmap_stage2_range(mmu, start, size, may_block); +} + +void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) +{ + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); } static void stage2_flush_memslot(struct kvm *kvm, @@ -250,7 +354,7 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush); + kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); } /** @@ -273,6 +377,8 @@ static void stage2_flush_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_flush_memslot(kvm, memslot); + kvm_nested_s2_flush(kvm); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } @@ -280,7 +386,7 @@ static void stage2_flush_vm(struct kvm *kvm) /** * free_hyp_pgds - free Hyp-mode page tables */ -void free_hyp_pgds(void) +void __init free_hyp_pgds(void) { mutex_lock(&kvm_hyp_pgd_mutex); if (hyp_pgtable) { @@ -511,6 +617,25 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return 0; } +static int __hyp_alloc_private_va_range(unsigned long base) +{ + lockdep_assert_held(&kvm_hyp_pgd_mutex); + + if (!PAGE_ALIGNED(base)) + return -EINVAL; + + /* + * Verify that BIT(VA_BITS - 1) hasn't been flipped by + * allocating the new area, as it would indicate we've + * overflowed the idmap/IO address range. + */ + if ((base ^ io_map_base) & BIT(VA_BITS - 1)) + return -ENOMEM; + + io_map_base = base; + + return 0; +} /** * hyp_alloc_private_va_range - Allocates a private VA range. @@ -531,29 +656,22 @@ int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) /* * This assumes that we have enough space below the idmap - * page to allocate our VAs. If not, the check below will - * kick. A potential alternative would be to detect that - * overflow and switch to an allocation above the idmap. + * page to allocate our VAs. If not, the check in + * __hyp_alloc_private_va_range() will kick. A potential + * alternative would be to detect that overflow and switch + * to an allocation above the idmap. * * The allocated size is always a multiple of PAGE_SIZE. */ - base = io_map_base - PAGE_ALIGN(size); - - /* Align the allocation based on the order of its size */ - base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size)); - - /* - * Verify that BIT(VA_BITS - 1) hasn't been flipped by - * allocating the new area, as it would indicate we've - * overflowed the idmap/IO address range. - */ - if ((base ^ io_map_base) & BIT(VA_BITS - 1)) - ret = -ENOMEM; - else - *haddr = io_map_base = base; + size = PAGE_ALIGN(size); + base = io_map_base - size; + ret = __hyp_alloc_private_va_range(base); mutex_unlock(&kvm_hyp_pgd_mutex); + if (!ret) + *haddr = base; + return ret; } @@ -587,6 +705,48 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, return ret; } +int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) +{ + unsigned long base; + size_t size; + int ret; + + mutex_lock(&kvm_hyp_pgd_mutex); + /* + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies + * an alignment of our allocation on the order of the size. + */ + size = NVHE_STACK_SIZE * 2; + base = ALIGN_DOWN(io_map_base - size, size); + + ret = __hyp_alloc_private_va_range(base); + + mutex_unlock(&kvm_hyp_pgd_mutex); + + if (ret) { + kvm_err("Cannot allocate hyp stack guard page\n"); + return ret; + } + + /* + * Since the stack grows downwards, map the stack to the page + * at the higher address and leave the lower guard page + * unbacked. + * + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 + * and addresses corresponding to the guard page have the + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. + */ + ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, + phys_addr, PAGE_HYP); + if (ret) + kvm_err("Cannot map hyp stack\n"); + + *haddr = base + size; + + return ret; +} + /** * create_hyp_io_mappings - Map IO into both kernel and HYP * @phys_addr: The physical start address which gets mapped @@ -661,18 +821,39 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) struct kvm_pgtable pgt = { .pgd = (kvm_pteref_t)kvm->mm->pgd, .ia_bits = vabits_actual, - .start_level = (KVM_PGTABLE_MAX_LEVELS - - CONFIG_PGTABLE_LEVELS), + .start_level = (KVM_PGTABLE_LAST_LEVEL - + ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), .mm_ops = &kvm_user_mm_ops, }; + unsigned long flags; kvm_pte_t pte = 0; /* Keep GCC quiet... */ - u32 level = ~0; + s8 level = S8_MAX; int ret; + /* + * Disable IRQs so that we hazard against a concurrent + * teardown of the userspace page tables (which relies on + * IPI-ing threads). + */ + local_irq_save(flags); ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); - VM_BUG_ON(ret); - VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); - VM_BUG_ON(!(pte & PTE_VALID)); + local_irq_restore(flags); + + if (ret) + return ret; + + /* + * Not seeing an error, but not updating level? Something went + * deeply wrong... + */ + if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) + return -EFAULT; + if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) + return -EFAULT; + + /* Oops, the userspace PTs are gone... Replay the fault */ + if (!kvm_pte_valid(pte)) + return -EAGAIN; return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); } @@ -681,7 +862,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, .free_pages_exact = kvm_s2_free_pages_exact, - .free_removed_table = stage2_free_removed_table, + .free_unlinked_table = stage2_free_unlinked_table, .get_page = kvm_host_get_page, .put_page = kvm_s2_put_page, .page_count = kvm_host_page_count, @@ -691,21 +872,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .icache_inval_pou = invalidate_icache_guest_page, }; -/** - * kvm_init_stage2_mmu - Initialise a S2 MMU structure - * @kvm: The pointer to the KVM structure - * @mmu: The pointer to the s2 MMU structure - * @type: The machine type of the virtual machine - * - * Allocates only the stage-2 HW PGD level table(s). - * Note we don't need locking here as this is only called when the VM is - * created, which can only be done once. - */ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) { u32 kvm_ipa_limit = get_kvm_ipa_limit(); - int cpu, err; - struct kvm_pgtable *pgt; u64 mmfr0, mmfr1; u32 phys_shift; @@ -730,22 +899,98 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + + return 0; +} + +/* + * Assume that @pgt is valid and unlinked from the KVM MMU to free the + * page-table without taking the kvm_mmu_lock and without performing any + * TLB invalidations. + * + * Also, the range of addresses can be large enough to cause need_resched + * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke + * cond_resched() periodically to prevent hogging the CPU for a long time + * and schedule something else, if required. + */ +static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, + phys_addr_t end) +{ + u64 next; + do { + next = stage2_range_addr_end(addr, end); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, + next - addr); + if (next != end) + cond_resched(); + } while (addr = next, addr != end); +} + +static void kvm_stage2_destroy(struct kvm_pgtable *pgt) +{ + unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); + + stage2_destroy_range(pgt, 0, BIT(ia_bits)); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); +} + +/** + * kvm_init_stage2_mmu - Initialise a S2 MMU structure + * @kvm: The pointer to the KVM structure + * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine + * + * Allocates only the stage-2 HW PGD level table(s). + * Note we don't need locking here as this is only called in two cases: + * + * - when the VM is created, which can't race against anything + * + * - when secondary kvm_s2_mmu structures are initialised for NV + * guests, and the caller must hold kvm->lock as this is called on a + * per-vcpu basis. + */ +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +{ + int cpu, err; + struct kvm_pgtable *pgt; + + /* + * If we already have our page tables in place, and that the + * MMU context is the canonical one, we have a bug somewhere, + * as this is only supposed to ever happen once per VM. + * + * Otherwise, we're building nested page tables, and that's + * probably because userspace called KVM_ARM_VCPU_INIT more + * than once on the same vcpu. Since that's actually legal, + * don't kick a fuss and leave gracefully. + */ if (mmu->pgt != NULL) { + if (kvm_is_nested_s2_mmu(kvm, mmu)) + return 0; + kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } + err = kvm_init_ipa_range(mmu, type); + if (err) + return err; + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; mmu->arch = &kvm->arch; - err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops); + err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); if (err) goto out_free_pgtable; + mmu->pgt = pgt; + if (is_protected_kvm_enabled()) + return 0; + mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); if (!mmu->last_vcpu_ran) { err = -ENOMEM; @@ -755,17 +1000,30 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; - mmu->pgt = pgt; + /* The eager page splitting is disabled by default */ + mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + mmu->split_page_cache.gfp_zero = __GFP_ZERO; + mmu->pgd_phys = __pa(pgt->pgd); + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + return 0; out_destroy_pgtable: - kvm_pgtable_stage2_destroy(pgt); + kvm_stage2_destroy(pgt); out_free_pgtable: kfree(pgt); return err; } +void kvm_uninit_stage2_mmu(struct kvm *kvm) +{ + kvm_free_stage2_pgd(&kvm->arch.mmu); + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); +} + static void stage2_unmap_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -802,7 +1060,7 @@ static void stage2_unmap_memslot(struct kvm *kvm, if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); } hva = vm_end; } while (hva < reg_end); @@ -829,6 +1087,8 @@ void stage2_unmap_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_unmap_memslot(kvm, memslot); + kvm_nested_s2_unmap(kvm, true); + write_unlock(&kvm->mmu_lock); mmap_read_unlock(current->mm); srcu_read_unlock(&kvm->srcu, idx); @@ -846,29 +1106,47 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); } + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + write_unlock(&kvm->mmu_lock); if (pgt) { - kvm_pgtable_stage2_destroy(pgt); + kvm_stage2_destroy(pgt); kfree(pgt); } } -static void hyp_mc_free_fn(void *addr, void *unused) +static void hyp_mc_free_fn(void *addr, void *mc) { + struct kvm_hyp_memcache *memcache = mc; + + if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) + kvm_account_pgtable_pages(addr, -1); + free_page((unsigned long)addr); } -static void *hyp_mc_alloc_fn(void *unused) +static void *hyp_mc_alloc_fn(void *mc) { - return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + struct kvm_hyp_memcache *memcache = mc; + void *addr; + + addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) + kvm_account_pgtable_pages(addr, 1); + + return addr; } void free_hyp_memcache(struct kvm_hyp_memcache *mc) { - if (is_protected_kvm_enabled()) - __free_hyp_memcache(mc, hyp_mc_free_fn, - kvm_host_va, NULL); + if (!is_protected_kvm_enabled()) + return; + + kfree(mc->mapping); + __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); } int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) @@ -876,8 +1154,14 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) if (!is_protected_kvm_enabled()) return 0; + if (!mc->mapping) { + mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); + if (!mc->mapping) + return -ENOMEM; + } + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, - kvm_host_pa, NULL); + kvm_host_pa, mc); } /** @@ -895,7 +1179,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t addr; int ret = 0; struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; - struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct kvm_pgtable *pgt = mmu->pgt; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_R | (writable ? KVM_PGTABLE_PROT_W : 0); @@ -908,13 +1193,13 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { ret = kvm_mmu_topup_memory_cache(&cache, - kvm_mmu_cache_min_pages(kvm)); + kvm_mmu_cache_min_pages(mmu)); if (ret) break; write_lock(&kvm->mmu_lock); - ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, - &cache, 0); + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, + pa, prot, &cache, 0); write_unlock(&kvm->mmu_lock); if (ret) break; @@ -927,15 +1212,14 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, } /** - * stage2_wp_range() - write protect stage2 memory region range + * kvm_stage2_wp_range() - write protect stage2 memory region range * @mmu: The KVM stage-2 MMU pointer * @addr: Start address of range * @end: End address of range */ -static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) +void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); - stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect); + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); } /** @@ -964,45 +1248,75 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_nested_s2_wp(kvm); write_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_memslot(kvm, memslot); } /** - * kvm_mmu_write_protect_pt_masked() - write protect dirty pages + * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE + * pages for memory slot * @kvm: The KVM pointer - * @slot: The memory slot associated with mask - * @gfn_offset: The gfn offset in memory slot - * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory - * slot to be write protected + * @slot: The memory slot to split * - * Walks bits set in mask write protects the associated pte's. Caller must - * acquire kvm_mmu_lock. + * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. */ -static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) +static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) { - phys_addr_t base_gfn = slot->base_gfn + gfn_offset; - phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; - phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + phys_addr_t start, end; + + lockdep_assert_held(&kvm->slots_lock); - stage2_wp_range(&kvm->arch.mmu, start, end); + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, slot); + + start = memslot->base_gfn << PAGE_SHIFT; + end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + write_lock(&kvm->mmu_lock); + kvm_mmu_split_huge_pages(kvm, start, end); + write_unlock(&kvm->mmu_lock); } /* - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * dirty pages. + * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of pages at offset 'gfn_offset' in this memory + * slot to enable dirty logging on * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. + * Writes protect selected pages to enable dirty logging, and then + * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + lockdep_assert_held_write(&kvm->mmu_lock); + + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); + + /* + * Eager-splitting is done when manual-protect is set. We + * also check for initially-all-set because we can avoid + * eager-splitting if initially-all-set is false. + * Initially-all-set equal false implies that huge-pages were + * already split when enabling dirty logging: no need to do it + * again. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + kvm_mmu_split_huge_pages(kvm, start, end); + + kvm_nested_s2_wp(kvm); } static void kvm_send_hwpoison_signal(unsigned long address, short lsb) @@ -1022,6 +1336,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, if (map_size == PAGE_SIZE) return true; + /* pKVM only supports PMD_SIZE huge-mappings */ + if (is_protected_kvm_enabled() && map_size != PMD_SIZE) + return false; + size = memslot->npages * PAGE_SIZE; gpa_start = memslot->base_gfn << PAGE_SHIFT; @@ -1079,7 +1397,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * * Returns the size of the mapping. */ -static unsigned long +static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) @@ -1091,30 +1409,17 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. */ - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && - get_user_mapping_size(kvm, hva) >= PMD_SIZE) { - /* - * The address we faulted on is backed by a transparent huge - * page. However, because we map the compound huge page and - * not the individual tail page, we need to transfer the - * refcount to the head page. We have to be careful that the - * THP doesn't start to split while we are adjusting the - * refcounts. - * - * We are sure this doesn't happen, because mmu_invalidate_retry - * was successful and we are holding the mmu_lock, so if this - * THP is trying to split, it will be blocked in the mmu - * notifier before touching any of the pages, specifically - * before being able to call __split_huge_page_refcount(). - * - * We can therefore safely transfer the refcount from PG_tail - * to PG_head and switch the pfn from a tail page to the head - * page accordingly. - */ + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + int sz = get_user_mapping_size(kvm, hva); + + if (sz < 0) + return sz; + + if (sz < PMD_SIZE) + return PAGE_SIZE; + *ipap &= PMD_MASK; - kvm_release_pfn_clean(pfn); pfn &= ~(PTRS_PER_PMD - 1); - get_page(pfn_to_page(pfn)); *pfnp = pfn; return PMD_SIZE; @@ -1158,21 +1463,29 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) * able to see the page's tags and therefore they must be initialised first. If * PG_mte_tagged is set, tags have already been initialised. * - * The race in the test/set of the PG_mte_tagged flag is handled by: - * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs - * racing to santise the same page - * - mmap_lock protects between a VM faulting a page in and the VMM performing - * an mprotect() to add VM_MTE + * Must be called with kvm->mmu_lock held to ensure the memory remains mapped + * while the tags are zeroed. */ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, unsigned long size) { unsigned long i, nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); if (!kvm_has_mte(kvm)) return; + if (folio_test_hugetlb(folio)) { + /* Hugetlb has MTE flags set on head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr_pages; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + return; + } + for (i = 0; i < nr_pages; i++, page++) { if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); @@ -1186,36 +1499,190 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) return vma->vm_flags & VM_MTE_ALLOWED; } +static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) +{ + switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) { + case MT_NORMAL_NC: + case MT_DEVICE_nGnRnE: + case MT_DEVICE_nGnRE: + return false; + default: + return true; + } +} + +static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache, + void **memcache) +{ + int min_pages; + + if (!is_protected_kvm_enabled()) + *memcache = &vcpu->arch.mmu_page_cache; + else + *memcache = &vcpu->arch.pkvm_memcache; + + if (!topup_memcache) + return 0; + + min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); + + if (!is_protected_kvm_enabled()) + return kvm_mmu_topup_memory_cache(*memcache, min_pages); + + return topup_hyp_memcache(*memcache, min_pages); +} + +/* + * Potentially reduce shadow S2 permissions to match the guest's own S2. For + * exec faults, we'd only reach this point if the guest actually allowed it (see + * kvm_s2_handle_perm_fault). + * + * Also encode the level of the original translation in the SW bits of the leaf + * entry as a proxy for the span of that translation. This will be retrieved on + * TLB invalidation from the guest and used to limit the invalidation scope if a + * TTL hint or a range isn't provided. + */ +static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, + enum kvm_pgtable_prot *prot, + bool *writable) +{ + *writable &= kvm_s2_trans_writable(nested); + if (!kvm_s2_trans_readable(nested)) + *prot &= ~KVM_PGTABLE_PROT_R; + + *prot |= kvm_encode_nested_level(nested); +} + +static void adjust_nested_exec_perms(struct kvm *kvm, + struct kvm_s2_trans *nested, + enum kvm_pgtable_prot *prot) +{ + if (!kvm_s2_trans_exec_el0(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_UX; + if (!kvm_s2_trans_exec_el1(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_PX; +} + +#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED) + +static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, + struct kvm_memory_slot *memslot, bool is_perm) +{ + bool write_fault, exec_fault, writable; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; + unsigned long mmu_seq; + struct page *page; + struct kvm *kvm = vcpu->kvm; + void *memcache; + kvm_pfn_t pfn; + gfn_t gfn; + int ret; + + ret = prepare_mmu_memcache(vcpu, true, &memcache); + if (ret) + return ret; + + if (nested) + gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT; + else + gfn = fault_ipa >> PAGE_SHIFT; + + write_fault = kvm_is_write_fault(vcpu); + exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); + + VM_WARN_ON_ONCE(write_fault && exec_fault); + + mmu_seq = kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL); + if (ret) { + kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE, + write_fault, exec_fault, false); + return ret; + } + + writable = !(memslot->flags & KVM_MEM_READONLY); + + if (nested) + adjust_nested_fault_perms(nested, &prot, &writable); + + if (writable) + prot |= KVM_PGTABLE_PROT_W; + + if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) + prot |= KVM_PGTABLE_PROT_X; + + if (nested) + adjust_nested_exec_perms(kvm, nested, &prot); + + kvm_fault_lock(kvm); + if (mmu_invalidate_retry(kvm, mmu_seq)) { + ret = -EAGAIN; + goto out_unlock; + } + + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE, + __pfn_to_phys(pfn), prot, + memcache, flags); + +out_unlock: + kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_fault_unlock(kvm); + + if (writable && !ret) + mark_page_dirty_in_slot(kvm, memslot, gfn); + + return ret != -EAGAIN ? ret : 0; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, - unsigned long fault_status) + bool fault_is_perm) { int ret = 0; - bool write_fault, writable, force_pte = false; - bool exec_fault; - bool device = false; + bool topup_memcache; + bool write_fault, writable; + bool exec_fault, mte_allowed, is_vma_cacheable; + bool s2_force_noncacheable = false, vfio_allow_any_uc = false; unsigned long mmu_seq; + phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; short vma_shift; + void *memcache; gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); - unsigned long vma_pagesize, fault_granule; + bool force_pte = logging_active; + long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; + struct page *page; + vm_flags_t vm_flags; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; - fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level); + if (fault_is_perm) + fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); - VM_BUG_ON(write_fault && exec_fault); + VM_WARN_ON_ONCE(write_fault && exec_fault); - if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) { - kvm_err("Unexpected L2 read permission error\n"); - return -EFAULT; - } + /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. + */ + topup_memcache = !fault_is_perm || (logging_active && write_fault); + ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache); + if (ret) + return ret; /* * Let's check if we will get back a huge page backed by hugetlbfs, or @@ -1229,16 +1696,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - /* - * logging_active is guaranteed to never be true for VM_PFNMAP - * memslots. - */ - if (logging_active) { - force_pte = true; + if (force_pte) vma_shift = PAGE_SHIFT; - } else { + else vma_shift = get_vma_page_shift(vma, hva); - } switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED @@ -1265,44 +1726,69 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } vma_pagesize = 1UL << vma_shift; - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) - fault_ipa &= ~(vma_pagesize - 1); - gfn = fault_ipa >> PAGE_SHIFT; - mmap_read_unlock(current->mm); + if (nested) { + unsigned long max_map_size; + + max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; + + ipa = kvm_s2_trans_output(nested); + + /* + * If we're about to create a shadow stage 2 entry, then we + * can only create a block mapping if the guest stage 2 page + * table uses at least as big a mapping. + */ + max_map_size = min(kvm_s2_trans_size(nested), max_map_size); + + /* + * Be careful that if the mapping size falls between + * two host sizes, take the smallest of the two. + */ + if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) + max_map_size = PMD_SIZE; + else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) + max_map_size = PAGE_SIZE; + + force_pte = (max_map_size == PAGE_SIZE); + vma_pagesize = min_t(long, vma_pagesize, max_map_size); + } /* - * Permission faults just need to update the existing leaf entry, - * and so normally don't require allocations from the memcache. The - * only exception to this is when dirty logging is enabled at runtime - * and a write fault needs to collapse a block entry into a table. + * Both the canonical IPA and fault IPA must be hugepage-aligned to + * ensure we find the right PFN and lay down the mapping in the right + * place. */ - if (fault_status != ESR_ELx_FSC_PERM || - (logging_active && write_fault)) { - ret = kvm_mmu_topup_memory_cache(memcache, - kvm_mmu_cache_min_pages(kvm)); - if (ret) - return ret; + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { + fault_ipa &= ~(vma_pagesize - 1); + ipa &= ~(vma_pagesize - 1); } - mmu_seq = vcpu->kvm->mmu_invalidate_seq; + gfn = ipa >> PAGE_SHIFT; + mte_allowed = kvm_vma_mte_allowed(vma); + + vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; + + vm_flags = vma->vm_flags; + + is_vma_cacheable = kvm_vma_is_cacheable(vma); + + /* Don't use the VMA after the unlock -- it may have vanished */ + vma = NULL; + /* - * Ensure the read of mmu_invalidate_seq happens before we call - * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk - * the page we just got a reference to gets unmapped before we have a - * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_gfn will take it away - * from us again properly. This smp_rmb() interacts with the smp_wmb() - * in kvm_mmu_notifier_invalidate_<page|range_end>. + * Read mmu_invalidate_seq so that KVM can detect if the results of + * vma_lookup() or __kvm_faultin_pfn() become stale prior to + * acquiring kvm->mmu_lock. * - * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is - * used to avoid unnecessary overhead introduced to locate the memory - * slot because it's always fixed even @gfn is adjusted for huge pages. + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). */ - smp_rmb(); + mmu_seq = kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, - write_fault, &writable, NULL); + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, + &writable, &page); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -1310,18 +1796,39 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (is_error_noslot_pfn(pfn)) return -EFAULT; - if (kvm_is_device_pfn(pfn)) { - /* - * If the page was identified as device early by looking at - * the VMA flags, vma_pagesize is already representing the - * largest quantity we can map. If instead it was mapped - * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE - * and must not be upgraded. - * - * In both cases, we don't let transparent_hugepage_adjust() - * change things at the last minute. - */ - device = true; + /* + * Check if this is non-struct page memory PFN, and cannot support + * CMOs. It could potentially be unsafe to access as cacheable. + */ + if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) { + if (is_vma_cacheable) { + /* + * Whilst the VMA owner expects cacheable mapping to this + * PFN, hardware also has to support the FWB and CACHE DIC + * features. + * + * ARM64 KVM relies on kernel VA mapping to the PFN to + * perform cache maintenance as the CMO instructions work on + * virtual addresses. VM_PFNMAP region are not necessarily + * mapped to a KVA and hence the presence of hardware features + * S2FWB and CACHE DIC are mandatory to avoid the need for + * cache maintenance. + */ + if (!kvm_supports_cacheable_pfnmap()) + ret = -EFAULT; + } else { + /* + * If the page was identified as device early by looking at + * the VMA flags, vma_pagesize is already representing the + * largest quantity we can map. If instead it was mapped + * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE + * and must not be upgraded. + * + * In both cases, we don't let transparent_hugepage_adjust() + * change things at the last minute. + */ + s2_force_noncacheable = true; + } } else if (logging_active && !write_fault) { /* * Only actually map the page as writable if this was a write @@ -1330,31 +1837,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, writable = false; } - if (exec_fault && device) - return -ENOEXEC; + if (exec_fault && s2_force_noncacheable) + ret = -ENOEXEC; - read_lock(&kvm->mmu_lock); + if (ret) { + kvm_release_page_unused(page); + return ret; + } + + if (nested) + adjust_nested_fault_perms(nested, &prot, &writable); + + kvm_fault_lock(kvm); pgt = vcpu->arch.hw_mmu->pgt; - if (mmu_invalidate_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) { + ret = -EAGAIN; goto out_unlock; + } /* * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible. */ - if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { - if (fault_status == ESR_ELx_FSC_PERM && - fault_granule > PAGE_SIZE) + if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) { + if (fault_is_perm && fault_granule > PAGE_SIZE) vma_pagesize = fault_granule; else vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &fault_ipa); + + if (vma_pagesize < 0) { + ret = vma_pagesize; + goto out_unlock; + } } - if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { + if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ - if (kvm_vma_mte_allowed(vma)) { + if (mte_allowed) { sanitise_mte_tags(kvm, pfn, vma_pagesize); } else { ret = -EFAULT; @@ -1368,53 +1889,138 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) prot |= KVM_PGTABLE_PROT_X; - if (device) - prot |= KVM_PGTABLE_PROT_DEVICE; - else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) + if (s2_force_noncacheable) { + if (vfio_allow_any_uc) + prot |= KVM_PGTABLE_PROT_NORMAL_NC; + else + prot |= KVM_PGTABLE_PROT_DEVICE; + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { prot |= KVM_PGTABLE_PROT_X; + } + + if (nested) + adjust_nested_exec_perms(kvm, nested, &prot); /* * Under the premise of getting a FSC_PERM fault, we just need to relax * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) - ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); - else - ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, + if (fault_is_perm && vma_pagesize == fault_granule) { + /* + * Drop the SW bits in favour of those stored in the + * PTE, which will be preserved. + */ + prot &= ~KVM_NV_GUEST_MAP_SZ; + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); + } else { + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, - memcache, KVM_PGTABLE_WALK_SHARED); + memcache, flags); + } + +out_unlock: + kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_fault_unlock(kvm); /* Mark the page dirty only if the fault is handled successfully */ - if (writable && !ret) { - kvm_set_pfn_dirty(pfn); + if (writable && !ret) mark_page_dirty_in_slot(kvm, memslot, gfn); - } -out_unlock: - read_unlock(&kvm->mmu_lock); - kvm_set_pfn_accessed(pfn); - kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; } /* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { - pte_t pte; - kvm_pte_t kpte; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); - write_lock(&vcpu->kvm->mmu_lock); + read_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; - kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); - write_unlock(&vcpu->kvm->mmu_lock); + KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); + read_unlock(&vcpu->kvm->mmu_lock); +} - pte = __pte(kpte); - if (pte_valid(pte)) - kvm_set_pfn_accessed(pte_pfn(pte)); +/* + * Returns true if the SEA should be handled locally within KVM if the abort + * is caused by a kernel memory allocation (e.g. stage-2 table memory). + */ +static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr) +{ + /* + * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort + * taken from a guest EL to EL2 is due to a host-imposed access (e.g. + * stage-2 PTW). + */ + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return true; + + /* KVM owns the VNCR when the vCPU isn't in a nested context. */ + if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR)) + return true; + + /* + * Determining if an external abort during a table walk happened at + * stage-2 is only possible with S1PTW is set. Otherwise, since KVM + * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the + * PA of the stage-1 descriptor) can reach here and are reported + * with a TTW ESR value. + */ + return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW)); +} + +int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_run *run = vcpu->run; + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 esr_mask = ESR_ELx_EC_MASK | + ESR_ELx_IL | + ESR_ELx_FnV | + ESR_ELx_EA | + ESR_ELx_CM | + ESR_ELx_WNR | + ESR_ELx_FSC; + u64 ipa; + + /* + * Give APEI the opportunity to claim the abort before handling it + * within KVM. apei_claim_sea() expects to be called with IRQs enabled. + */ + lockdep_assert_irqs_enabled(); + if (apei_claim_sea(NULL) == 0) + return 1; + + if (host_owns_sea(vcpu, esr) || + !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags)) + return kvm_inject_serror(vcpu); + + /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */ + if (kvm_has_ras(kvm)) + esr_mask |= ESR_ELx_SET_MASK; + + /* + * Exit to userspace, and provide faulting guest virtual and physical + * addresses in case userspace wants to emulate SEA to guest by + * writing to FAR_ELx and HPFAR_ELx registers. + */ + memset(&run->arm_sea, 0, sizeof(run->arm_sea)); + run->exit_reason = KVM_EXIT_ARM_SEA; + run->arm_sea.esr = esr & esr_mask; + + if (!(esr & ESR_ELx_FnV)) + run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu); + + ipa = kvm_vcpu_get_fault_ipa(vcpu); + if (ipa != INVALID_GPA) { + run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID; + run->arm_sea.gpa = ipa; + } + + return 0; } /** @@ -1430,20 +2036,32 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) */ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { - unsigned long fault_status; - phys_addr_t fault_ipa; + struct kvm_s2_trans nested_trans, *nested = NULL; + unsigned long esr; + phys_addr_t fault_ipa; /* The address we faulted on */ + phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ struct kvm_memory_slot *memslot; unsigned long hva; bool is_iabt, write_fault, writable; gfn_t gfn; int ret, idx; - fault_status = kvm_vcpu_trap_get_fault_type(vcpu); + if (kvm_vcpu_abt_issea(vcpu)) + return kvm_handle_guest_sea(vcpu); + + esr = kvm_vcpu_get_esr(vcpu); + + /* + * The fault IPA should be reliable at this point as we're not dealing + * with an SEA. + */ + ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) + return -EFAULT; - fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - if (fault_status == ESR_ELx_FSC_FAULT) { + if (esr_fsc_is_translation_fault(esr)) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); @@ -1451,36 +2069,20 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) } /* Falls between the IPA range and the PARange? */ - if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) { + if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); - if (is_iabt) - kvm_inject_pabt(vcpu, fault_ipa); - else - kvm_inject_dabt(vcpu, fault_ipa); - return 1; + return kvm_inject_sea(vcpu, is_iabt, fault_ipa); } } - /* Synchronous External Abort? */ - if (kvm_vcpu_abt_issea(vcpu)) { - /* - * For RAS the host kernel may handle this abort. - * There is no need to pass the error into the guest. - */ - if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu))) - kvm_inject_vabt(vcpu); - - return 1; - } - trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), kvm_vcpu_get_hfar(vcpu), fault_ipa); /* Check the stage-2 fault is trans. fault or write fault */ - if (fault_status != ESR_ELx_FSC_FAULT && - fault_status != ESR_ELx_FSC_PERM && - fault_status != ESR_ELx_FSC_ACCESS) { + if (!esr_fsc_is_translation_fault(esr) && + !esr_fsc_is_permission_fault(esr) && + !esr_fsc_is_access_flag_fault(esr)) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), @@ -1490,7 +2092,47 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) idx = srcu_read_lock(&vcpu->kvm->srcu); - gfn = fault_ipa >> PAGE_SHIFT; + /* + * We may have faulted on a shadow stage 2 page table if we are + * running a nested guest. In this case, we have to resolve the L2 + * IPA to the L1 IPA first, before knowing what kind of memory should + * back the L1 IPA. + * + * If the shadow stage 2 page table walk faults, then we simply inject + * this to the guest and carry on. + * + * If there are no shadow S2 PTs because S2 is disabled, there is + * nothing to walk and we treat it as a 1:1 before going through the + * canonical translation. + */ + if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && + vcpu->arch.hw_mmu->nested_stage2_enabled) { + u32 esr; + + ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); + if (ret == -EAGAIN) { + ret = 1; + goto out_unlock; + } + + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ipa = kvm_s2_trans_output(&nested_trans); + nested = &nested_trans; + } + + gfn = ipa >> PAGE_SHIFT; memslot = gfn_to_memslot(vcpu->kvm, gfn); hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); write_fault = kvm_is_write_fault(vcpu); @@ -1507,8 +2149,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) } if (kvm_vcpu_abt_iss1tw(vcpu)) { - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - ret = 1; + ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); goto out_unlock; } @@ -1534,28 +2175,34 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * faulting VA. This is always 12 bits, irrespective * of the page size. */ - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, fault_ipa); + ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + ret = io_mem_abort(vcpu, ipa); goto out_unlock; } /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); + VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); - if (fault_status == ESR_ELx_FSC_ACCESS) { + if (esr_fsc_is_access_flag_fault(esr)) { handle_access_fault(vcpu, fault_ipa); ret = 1; goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && + !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu)); + + if (kvm_slot_has_gmem(memslot)) + ret = gmem_abort(vcpu, fault_ipa, nested, memslot, + esr_fsc_is_permission_fault(esr)); + else + ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, + esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; out: - if (ret == -ENOEXEC) { - kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - ret = 1; - } + if (ret == -ENOEXEC) + ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu)); out_unlock: srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; @@ -1570,67 +2217,36 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) (range->end - range->start) << PAGE_SHIFT, range->may_block); + kvm_nested_s2_unmap(kvm, range->may_block); return false; } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - kvm_pfn_t pfn = pte_pfn(range->pte); + u64 size = (range->end - range->start) << PAGE_SHIFT; if (!kvm->arch.mmu.pgt) return false; - WARN_ON(range->end - range->start != 1); - + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT, + size, true); /* - * If the page isn't tagged, defer to user_mem_abort() for sanitising - * the MTE tags. The S2 pte should have been unmapped by - * mmu_notifier_invalidate_range_end(). + * TODO: Handle nested_mmu structures here using the reverse mapping in + * a later version of patch series. */ - if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn))) - return false; - - /* - * We've moved a page around, probably through CoW, so let's treat - * it just like a translation fault and the map handler will clean - * the cache to the PoC. - * - * The MMU notifiers will have unmapped a huge PMD before calling - * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and - * therefore we never need to clear out a huge PMD through this - * calling path and a memcache is not required. - */ - kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, - PAGE_SIZE, __pfn_to_phys(pfn), - KVM_PGTABLE_PROT_R, NULL, 0); - - return false; } -bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { u64 size = (range->end - range->start) << PAGE_SHIFT; - kvm_pte_t kpte; - pte_t pte; if (!kvm->arch.mmu.pgt) return false; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - - kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, - range->start << PAGE_SHIFT); - pte = __pte(kpte); - return pte_valid(pte) && pte_young(pte); -} - -bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - if (!kvm->arch.mmu.pgt) - return false; - - return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, - range->start << PAGE_SHIFT); + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT, + size, false); } phys_addr_t kvm_mmu_get_httbr(void) @@ -1668,7 +2284,7 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { .virt_to_phys = kvm_host_pa, }; -int kvm_mmu_init(u32 *hyp_va_bits) +int __init kvm_mmu_init(u32 *hyp_va_bits) { int err; u32 idmap_bits; @@ -1687,16 +2303,9 @@ int kvm_mmu_init(u32 *hyp_va_bits) BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); /* - * The ID map may be configured to use an extended virtual address - * range. This is only the case if system RAM is out of range for the - * currently configured page size and VA_BITS_MIN, in which case we will - * also need the extended virtual range for the HYP ID map, or we won't - * be able to enable the EL2 MMU. - * - * However, in some cases the ID map may be configured for fewer than - * the number of VA bits used by the regular kernel stage 1. This - * happens when VA_BITS=52 and the kernel image is placed in PA space - * below 48 bits. + * The ID map is always configured for 48 bits of translation, which + * may be fewer than the number of VA bits used by the regular kernel + * stage 1, when VA_BITS=52. * * At EL2, there is only one TTBR register, and we can't switch between * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom @@ -1707,7 +2316,7 @@ int kvm_mmu_init(u32 *hyp_va_bits) * 1 VA bits to assure that the hypervisor can both ID map its code page * and map any kernel memory. */ - idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + idmap_bits = IDMAP_VA_BITS; kernel_bits = vabits_actual; *hyp_va_bits = max(idmap_bits, kernel_bits); @@ -1745,6 +2354,7 @@ int kvm_mmu_init(u32 *hyp_va_bits) goto out_destroy_pgtable; io_map_base = hyp_idmap_start; + __hyp_va_bits = *hyp_va_bits; return 0; out_destroy_pgtable: @@ -1761,20 +2371,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; + /* * At this point memslot has been committed and there is an * allocated dirty_bitmap[], dirty pages will be tracked while the * memory slot is write protected. */ - if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (log_dirty_pages) { + + if (change == KVM_MR_DELETE) + return; + /* - * If we're with initial-all-set, we don't need to write - * protect any pages because they're all reported as dirty. - * Huge pages and normal pages will be write protect gradually. + * Huge and normal pages are write-protected and split + * on either of these two cases: + * + * 1. with initial-all-set: gradually with CLEAR ioctls, */ - if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) { - kvm_mmu_wp_memory_region(kvm, new->id); - } + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + return; + /* + * or + * 2. without initial-all-set: all in one shot when + * enabling dirty logging. + */ + kvm_mmu_wp_memory_region(kvm, new->id); + kvm_mmu_split_memory_region(kvm, new->id); + } else { + /* + * Free any leftovers from the eager page splitting cache. Do + * this when deleting, moving, disabling dirty logging, or + * creating the memslot (a nop). Doing it for deletes makes + * sure we don't leak memory, and there's no need to keep the + * cache around for any of the other cases. + */ + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); } } @@ -1794,9 +2426,16 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * Prevent userspace from creating a memory region outside of the IPA * space addressable by the KVM guest IPA space. */ - if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT)) + if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) return -EFAULT; + /* + * Only support guest_memfd backed memslots with mappable memory, since + * there aren't any CoCo VMs that support only private memory on arm64. + */ + if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new)) + return -EINVAL; + hva = new->userspace_addr; reg_end = hva + (new->npages << PAGE_SHIFT); @@ -1830,6 +2469,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, ret = -EINVAL; break; } + + /* + * Cacheable PFNMAP is allowed only if the hardware + * supports it. + */ + if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) { + ret = -EINVAL; + break; + } } hva = min(reg_end, vma->vm_end); } while (hva < reg_end); @@ -1846,11 +2494,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_free_stage2_pgd(&kvm->arch.mmu); -} - void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { @@ -1858,7 +2501,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - unmap_stage2_range(&kvm->arch.mmu, gpa, size); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); + kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); } diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c new file mode 100644 index 000000000000..cdeeb8f09e72 --- /dev/null +++ b/arch/arm64/kvm/nested.c @@ -0,0 +1,1919 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Columbia University and Linaro Ltd. + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/bitfield.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/fixmap.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> +#include <asm/sysreg.h> + +#include "sys_regs.h" + +struct vncr_tlb { + /* The guest's VNCR_EL2 */ + u64 gva; + struct s1_walk_info wi; + struct s1_walk_result wr; + + u64 hpa; + + /* -1 when not mapped on a CPU */ + int cpu; + + /* + * true if the TLB is valid. Can only be changed with the + * mmu_lock held. + */ + bool valid; +}; + +/* + * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between + * memory usage and potential number of different sets of S2 PTs in + * the guests. Running out of S2 MMUs only affects performance (we + * will invalidate them more often). + */ +#define S2_MMU_PER_VCPU 2 + +void kvm_init_nested(struct kvm *kvm) +{ + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; + atomic_set(&kvm->arch.vncr_map_count, 0); +} + +static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +{ + /* + * We only initialise the IPA range on the canonical MMU, which + * defines the contract between KVM and userspace on where the + * "hardware" is in the IPA space. This affects the validity of MMIO + * exits forwarded to userspace, for example. + * + * For nested S2s, we use the PARange as exposed to the guest, as it + * is allowed to use it at will to expose whatever memory map it + * wants to its own guests as it would be on real HW. + */ + return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm)); +} + +int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_s2_mmu *tmp; + int num_mmus, ret = 0; + + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) && + !cpus_have_final_cap(ARM64_HAS_HCR_NV1)) + return -EINVAL; + + if (!vcpu->arch.ctxt.vncr_array) + vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT | + __GFP_ZERO); + + if (!vcpu->arch.ctxt.vncr_array) + return -ENOMEM; + + /* + * Let's treat memory allocation failures as benign: If we fail to + * allocate anything, return an error and keep the allocated array + * alive. Userspace may try to recover by initializing the vcpu + * again, and there is no reason to affect the whole VM for this. + */ + num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; + tmp = kvrealloc(kvm->arch.nested_mmus, + size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!tmp) + return -ENOMEM; + + swap(kvm->arch.nested_mmus, tmp); + + /* + * If we went through a realocation, adjust the MMU back-pointers in + * the previously initialised kvm_pgtable structures. + */ + if (kvm->arch.nested_mmus != tmp) + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) + kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; + + for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) + ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); + + if (ret) { + for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) + kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); + + free_page((unsigned long)vcpu->arch.ctxt.vncr_array); + vcpu->arch.ctxt.vncr_array = NULL; + + return ret; + } + + kvm->arch.nested_mmus_size = num_mmus; + + return 0; +} + +struct s2_walk_info { + u64 baddr; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int sl; + unsigned int t0sz; + bool be; + bool ha; +}; + +static u32 compute_fsc(int level, u32 fsc) +{ + return fsc | (level & 0x3); +} + +static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc) +{ + u32 esr; + + esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC; + esr |= compute_fsc(level, fsc); + return esr; +} + +static int get_ia_size(struct s2_walk_info *wi) +{ + return 64 - wi->t0sz; +} + +static int check_base_s2_limits(struct s2_walk_info *wi, + int level, int input_size, int stride) +{ + int start_size, ia_size; + + ia_size = get_ia_size(wi); + + /* Check translation limits */ + switch (BIT(wi->pgshift)) { + case SZ_64K: + if (level == 0 || (level == 1 && ia_size <= 42)) + return -EFAULT; + break; + case SZ_16K: + if (level == 0 || (level == 1 && ia_size <= 40)) + return -EFAULT; + break; + case SZ_4K: + if (level < 0 || (level == 0 && ia_size <= 42)) + return -EFAULT; + break; + } + + /* Check input size limits */ + if (input_size > ia_size) + return -EFAULT; + + /* Check number of entries in starting level table */ + start_size = input_size - ((3 - level) * stride + wi->pgshift); + if (start_size < 1 || start_size > stride + 4) + return -EFAULT; + + return 0; +} + +/* Check if output is within boundaries */ +static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) +{ + unsigned int output_size = wi->max_oa_bits; + + if (output_size != 48 && (output & GENMASK_ULL(47, output_size))) + return -1; + + return 0; +} + +static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc, + struct s2_walk_info *wi) +{ + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + /* + * Handle reversedescriptors if endianness differs between the + * host and the guest hypervisor. + */ + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; +} + +static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new, + struct s2_walk_info *wi) +{ + if (wi->be) { + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); + } else { + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + +/* + * This is essentially a C-version of the pseudo code from the ARM ARM + * AArch64.TranslationTableWalk function. I strongly recommend looking at + * that pseudocode in trying to understand this. + * + * Must be called with the kvm->srcu read lock held + */ +static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, + struct s2_walk_info *wi, struct kvm_s2_trans *out) +{ + int first_block_level, level, stride, input_size, base_lower_bound; + phys_addr_t base_addr; + unsigned int addr_top, addr_bottom; + u64 desc, new_desc; /* page table entry */ + int ret; + phys_addr_t paddr; + + switch (BIT(wi->pgshift)) { + default: + case SZ_64K: + case SZ_16K: + level = 3 - wi->sl; + first_block_level = 2; + break; + case SZ_4K: + level = 2 - wi->sl; + first_block_level = 1; + break; + } + + stride = wi->pgshift - 3; + input_size = get_ia_size(wi); + if (input_size > 48 || input_size < 25) + return -EFAULT; + + ret = check_base_s2_limits(wi, level, input_size, stride); + if (WARN_ON(ret)) + return ret; + + base_lower_bound = 3 + input_size - ((3 - level) * stride + + wi->pgshift); + base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound); + + if (check_output_size(wi, base_addr)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + return 1; + } + + addr_top = input_size - 1; + + while (1) { + phys_addr_t index; + + addr_bottom = (3 - level) * stride + wi->pgshift; + index = (ipa & GENMASK_ULL(addr_top, addr_bottom)) + >> (addr_bottom - 3); + + paddr = base_addr | index; + ret = read_guest_s2_desc(vcpu, paddr, &desc, wi); + if (ret < 0) + return ret; + + new_desc = desc; + + /* Check for valid descriptor at this point */ + if (!(desc & KVM_PTE_VALID)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) { + if (level < 3) + break; + + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + /* We're at the final level */ + if (level == 3) + break; + + if (check_output_size(wi, desc)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + out->desc = desc; + return 1; + } + + base_addr = desc & GENMASK_ULL(47, wi->pgshift); + + level += 1; + addr_top = addr_bottom - 1; + } + + if (level < first_block_level) { + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + if (check_output_size(wi, desc)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + out->desc = desc; + return 1; + } + + if (wi->ha) + new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + + if (new_desc != desc) { + ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + + if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); + out->desc = desc; + return 1; + } + + addr_bottom += contiguous_bit_shift(desc, wi, level); + + /* Calculate and return the result */ + paddr = (desc & GENMASK_ULL(47, addr_bottom)) | + (ipa & GENMASK_ULL(addr_bottom - 1, 0)); + out->output = paddr; + out->block_size = 1UL << ((3 - level) * stride + wi->pgshift); + out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + out->level = level; + out->desc = desc; + return 0; +} + +static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) +{ + wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK; + + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + wi->pgshift = 12; break; + case VTCR_EL2_TG0_16K: + wi->pgshift = 14; break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + + wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + /* Global limit for now, should eventually be per-VM */ + wi->max_oa_bits = min(get_kvm_ipa_limit(), + ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); + + wi->ha = vtcr & VTCR_EL2_HA; +} + +int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, + struct kvm_s2_trans *result) +{ + u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + struct s2_walk_info wi; + int ret; + + result->esr = 0; + + if (!vcpu_has_nv(vcpu)) + return 0; + + wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + vtcr_to_walk_info(vtcr, &wi); + + wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; + + ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result); + if (ret) + result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC); + + return ret; +} + +static unsigned int ttl_to_size(u8 ttl) +{ + int level = ttl & 3; + int gran = (ttl >> 2) & 3; + unsigned int max_size = 0; + + switch (gran) { + case TLBI_TTL_TG_4K: + switch (level) { + case 0: + break; + case 1: + max_size = SZ_1G; + break; + case 2: + max_size = SZ_2M; + break; + case 3: + max_size = SZ_4K; + break; + } + break; + case TLBI_TTL_TG_16K: + switch (level) { + case 0: + case 1: + break; + case 2: + max_size = SZ_32M; + break; + case 3: + max_size = SZ_16K; + break; + } + break; + case TLBI_TTL_TG_64K: + switch (level) { + case 0: + case 1: + /* No 52bit IPA support */ + break; + case 2: + max_size = SZ_512M; + break; + case 3: + max_size = SZ_64K; + break; + } + break; + default: /* No size information */ + break; + } + + return max_size; +} + +static u8 pgshift_level_to_ttl(u16 shift, u8 level) +{ + u8 ttl; + + switch(shift) { + case 12: + ttl = TLBI_TTL_TG_4K; + break; + case 14: + ttl = TLBI_TTL_TG_16K; + break; + case 16: + ttl = TLBI_TTL_TG_64K; + break; + default: + BUG(); + } + + ttl <<= 2; + ttl |= level & 3; + + return ttl; +} + +/* + * Compute the equivalent of the TTL field by parsing the shadow PT. The + * granule size is extracted from the cached VTCR_EL2.TG0 while the level is + * retrieved from first entry carrying the level as a tag. + */ +static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr) +{ + u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr; + kvm_pte_t pte; + u8 ttl, level; + + lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock); + + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + ttl = (TLBI_TTL_TG_4K << 2); + break; + case VTCR_EL2_TG0_16K: + ttl = (TLBI_TTL_TG_16K << 2); + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + ttl = (TLBI_TTL_TG_64K << 2); + break; + } + + tmp = addr; + +again: + /* Iteratively compute the block sizes for a particular granule size */ + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + if (sz < SZ_4K) sz = SZ_4K; + else if (sz < SZ_2M) sz = SZ_2M; + else if (sz < SZ_1G) sz = SZ_1G; + else sz = 0; + break; + case VTCR_EL2_TG0_16K: + if (sz < SZ_16K) sz = SZ_16K; + else if (sz < SZ_32M) sz = SZ_32M; + else sz = 0; + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + if (sz < SZ_64K) sz = SZ_64K; + else if (sz < SZ_512M) sz = SZ_512M; + else sz = 0; + break; + } + + if (sz == 0) + return 0; + + tmp &= ~(sz - 1); + if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL)) + goto again; + if (!(pte & PTE_VALID)) + goto again; + level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte); + if (!level) + goto again; + + ttl |= level; + + /* + * We now have found some level information in the shadow S2. Check + * that the resulting range is actually including the original IPA. + */ + sz = ttl_to_size(ttl); + if (addr < (tmp + sz)) + return ttl; + + return 0; +} + +unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); + unsigned long max_size; + u8 ttl; + + ttl = FIELD_GET(TLBI_TTL_MASK, val); + + if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) { + /* No TTL, check the shadow S2 for a hint */ + u64 addr = (val & GENMASK_ULL(35, 0)) << 12; + ttl = get_guest_mapping_ttl(mmu, addr); + } + + max_size = ttl_to_size(ttl); + + if (!max_size) { + /* Compute the maximum extent of the invalidation */ + switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + max_size = SZ_1G; + break; + case VTCR_EL2_TG0_16K: + max_size = SZ_32M; + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + /* + * No, we do not support 52bit IPA in nested yet. Once + * we do, this should be 4TB. + */ + max_size = SZ_512M; + break; + } + } + + WARN_ON(!max_size); + return max_size; +} + +/* + * We can have multiple *different* MMU contexts with the same VMID: + * + * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit + * + * - Multiple vcpus using private S2s (huh huh...), hence differing by the + * VBBTR_EL2.BADDR address + * + * - A combination of the above... + * + * We can always identify which MMU context to pick at run-time. However, + * TLB invalidation involving a VMID must take action on all the TLBs using + * this particular VMID. This translates into applying the same invalidation + * operation to all the contexts that are using this VMID. Moar phun! + */ +void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, + const union tlbi_info *info, + void (*tlbi_callback)(struct kvm_s2_mmu *, + const union tlbi_info *)) +{ + write_lock(&kvm->mmu_lock); + + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!kvm_s2_mmu_valid(mmu)) + continue; + + if (vmid == get_vmid(mmu->tlb_vttbr)) + tlbi_callback(mmu, info); + } + + write_unlock(&kvm->mmu_lock); +} + +struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + bool nested_stage2_enabled; + u64 vttbr, vtcr, hcr; + + lockdep_assert_held_write(&kvm->mmu_lock); + + vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + hcr = vcpu_read_sys_reg(vcpu, HCR_EL2); + + nested_stage2_enabled = hcr & HCR_VM; + + /* Don't consider the CnP bit for the vttbr match */ + vttbr &= ~VTTBR_CNP_BIT; + + /* + * Two possibilities when looking up a S2 MMU context: + * + * - either S2 is enabled in the guest, and we need a context that is + * S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR, + * which makes it safe from a TLB conflict perspective (a broken + * guest won't be able to generate them), + * + * - or S2 is disabled, and we need a context that is S2-disabled + * and matches the VMID only, as all TLBs are tagged by VMID even + * if S2 translation is disabled. + */ + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!kvm_s2_mmu_valid(mmu)) + continue; + + if (nested_stage2_enabled && + mmu->nested_stage2_enabled && + vttbr == mmu->tlb_vttbr && + vtcr == mmu->tlb_vtcr) + return mmu; + + if (!nested_stage2_enabled && + !mmu->nested_stage2_enabled && + get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr)) + return mmu; + } + return NULL; +} + +static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_s2_mmu *s2_mmu; + int i; + + lockdep_assert_held_write(&vcpu->kvm->mmu_lock); + + s2_mmu = lookup_s2_mmu(vcpu); + if (s2_mmu) + goto out; + + /* + * Make sure we don't always search from the same point, or we + * will always reuse a potentially active context, leaving + * free contexts unused. + */ + for (i = kvm->arch.nested_mmus_next; + i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next); + i++) { + s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size]; + + if (atomic_read(&s2_mmu->refcnt) == 0) + break; + } + BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */ + + /* Set the scene for the next search */ + kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; + + /* Make sure we don't forget to do the laundry */ + if (kvm_s2_mmu_valid(s2_mmu)) + s2_mmu->pending_unmap = true; + + /* + * The virtual VMID (modulo CnP) will be used as a key when matching + * an existing kvm_s2_mmu. + * + * We cache VTCR at allocation time, once and for all. It'd be great + * if the guest didn't screw that one up, as this is not very + * forgiving... + */ + s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT; + s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM; + +out: + atomic_inc(&s2_mmu->refcnt); + + /* + * Set the vCPU request to perform an unmap, even if the pending unmap + * originates from another vCPU. This guarantees that the MMU has been + * completely unmapped before any vCPU actually uses it, and allows + * multiple vCPUs to lend a hand with completing the unmap. + */ + if (s2_mmu->pending_unmap) + kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu); + + return s2_mmu; +} + +void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) +{ + /* CnP being set denotes an invalid entry */ + mmu->tlb_vttbr = VTTBR_CNP_BIT; + mmu->nested_stage2_enabled = false; + atomic_set(&mmu->refcnt, 0); +} + +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) +{ + /* + * If the vCPU kept its reference on the MMU after the last put, + * keep rolling with it. + */ + if (is_hyp_ctxt(vcpu)) { + if (!vcpu->arch.hw_mmu) + vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + } else { + if (!vcpu->arch.hw_mmu) { + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) + vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); + } + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) + kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu); + } +} + +void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) +{ + /* Unconditionally drop the VNCR mapping if we have one */ + if (host_data_test_flag(L1_VNCR_MAPPED)) { + BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id()); + BUG_ON(is_hyp_ctxt(vcpu)); + + clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu)); + vcpu->arch.vncr_tlb->cpu = -1; + host_data_clear_flag(L1_VNCR_MAPPED); + atomic_dec(&vcpu->kvm->arch.vncr_map_count); + } + + /* + * Keep a reference on the associated stage-2 MMU if the vCPU is + * scheduling out and not in WFI emulation, suggesting it is likely to + * reuse the MMU sometime soon. + */ + if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI)) + return; + + if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) + atomic_dec(&vcpu->arch.hw_mmu->refcnt); + + vcpu->arch.hw_mmu = NULL; +} + +/* + * Returns non-zero if permission fault is handled by injecting it to the next + * level hypervisor. + */ +int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans) +{ + bool forward_fault = false; + + trans->esr = 0; + + if (!kvm_vcpu_trap_is_permission_fault(vcpu)) + return 0; + + if (kvm_vcpu_trap_is_iabt(vcpu)) { + if (vcpu_mode_priv(vcpu)) + forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans); + else + forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans); + } else { + bool write_fault = kvm_is_write_fault(vcpu); + + forward_fault = ((write_fault && !trans->writable) || + (!write_fault && !trans->readable)); + } + + if (forward_fault) + trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM); + + return forward_fault; +} + +int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2) +{ + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2); + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2); + + return kvm_inject_nested_sync(vcpu, esr_el2); +} + +static void invalidate_vncr(struct vncr_tlb *vt) +{ + vt->valid = false; + if (vt->cpu != -1) + clear_fixmap(vncr_fixmap(vt->cpu)); +} + +static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 ipa_start, ipa_end, ipa_size; + + /* + * Careful here: We end-up here from an MMU notifier, + * and this can race against a vcpu not being onlined + * yet, without the pseudo-TLB being allocated. + * + * Skip those, as they obviously don't participate in + * the invalidation at this stage. + */ + if (!vt) + continue; + + if (!vt->valid) + continue; + + ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, + vt->wr.level)); + ipa_start = vt->wr.pa & ~(ipa_size - 1); + ipa_end = ipa_start + ipa_size; + + if (ipa_end <= start || ipa_start >= end) + continue; + + invalidate_vncr(vt); + } +} + +struct s1e2_tlbi_scope { + enum { + TLBI_ALL, + TLBI_VA, + TLBI_VAA, + TLBI_ASID, + } type; + + u16 asid; + u64 va; + u64 size; +}; + +static void invalidate_vncr_va(struct kvm *kvm, + struct s1e2_tlbi_scope *scope) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 va_start, va_end, va_size; + + if (!vt->valid) + continue; + + va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, + vt->wr.level)); + va_start = vt->gva & ~(va_size - 1); + va_end = va_start + va_size; + + switch (scope->type) { + case TLBI_ALL: + break; + + case TLBI_VA: + if (va_end <= scope->va || + va_start >= (scope->va + scope->size)) + continue; + if (vt->wr.nG && vt->wr.asid != scope->asid) + continue; + break; + + case TLBI_VAA: + if (va_end <= scope->va || + va_start >= (scope->va + scope->size)) + continue; + break; + + case TLBI_ASID: + if (!vt->wr.nG || vt->wr.asid != scope->asid) + continue; + break; + } + + invalidate_vncr(vt); + } +} + +#define tlbi_va_s1_to_va(v) (u64)sign_extend64((v) << 12, 48) + +static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val, + struct s1e2_tlbi_scope *scope) +{ + switch (inst) { + case OP_TLBI_ALLE2: + case OP_TLBI_ALLE2IS: + case OP_TLBI_ALLE2OS: + case OP_TLBI_VMALLE1: + case OP_TLBI_VMALLE1IS: + case OP_TLBI_VMALLE1OS: + case OP_TLBI_ALLE2NXS: + case OP_TLBI_ALLE2ISNXS: + case OP_TLBI_ALLE2OSNXS: + case OP_TLBI_VMALLE1NXS: + case OP_TLBI_VMALLE1ISNXS: + case OP_TLBI_VMALLE1OSNXS: + scope->type = TLBI_ALL; + break; + case OP_TLBI_VAE2: + case OP_TLBI_VAE2IS: + case OP_TLBI_VAE2OS: + case OP_TLBI_VAE1: + case OP_TLBI_VAE1IS: + case OP_TLBI_VAE1OS: + case OP_TLBI_VAE2NXS: + case OP_TLBI_VAE2ISNXS: + case OP_TLBI_VAE2OSNXS: + case OP_TLBI_VAE1NXS: + case OP_TLBI_VAE1ISNXS: + case OP_TLBI_VAE1OSNXS: + case OP_TLBI_VALE2: + case OP_TLBI_VALE2IS: + case OP_TLBI_VALE2OS: + case OP_TLBI_VALE1: + case OP_TLBI_VALE1IS: + case OP_TLBI_VALE1OS: + case OP_TLBI_VALE2NXS: + case OP_TLBI_VALE2ISNXS: + case OP_TLBI_VALE2OSNXS: + case OP_TLBI_VALE1NXS: + case OP_TLBI_VALE1ISNXS: + case OP_TLBI_VALE1OSNXS: + scope->type = TLBI_VA; + scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val)); + if (!scope->size) + scope->size = SZ_1G; + scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1); + scope->asid = FIELD_GET(TLBIR_ASID_MASK, val); + break; + case OP_TLBI_ASIDE1: + case OP_TLBI_ASIDE1IS: + case OP_TLBI_ASIDE1OS: + case OP_TLBI_ASIDE1NXS: + case OP_TLBI_ASIDE1ISNXS: + case OP_TLBI_ASIDE1OSNXS: + scope->type = TLBI_ASID; + scope->asid = FIELD_GET(TLBIR_ASID_MASK, val); + break; + case OP_TLBI_VAAE1: + case OP_TLBI_VAAE1IS: + case OP_TLBI_VAAE1OS: + case OP_TLBI_VAAE1NXS: + case OP_TLBI_VAAE1ISNXS: + case OP_TLBI_VAAE1OSNXS: + case OP_TLBI_VAALE1: + case OP_TLBI_VAALE1IS: + case OP_TLBI_VAALE1OS: + case OP_TLBI_VAALE1NXS: + case OP_TLBI_VAALE1ISNXS: + case OP_TLBI_VAALE1OSNXS: + scope->type = TLBI_VAA; + scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val)); + if (!scope->size) + scope->size = SZ_1G; + scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1); + break; + case OP_TLBI_RVAE2: + case OP_TLBI_RVAE2IS: + case OP_TLBI_RVAE2OS: + case OP_TLBI_RVAE1: + case OP_TLBI_RVAE1IS: + case OP_TLBI_RVAE1OS: + case OP_TLBI_RVAE2NXS: + case OP_TLBI_RVAE2ISNXS: + case OP_TLBI_RVAE2OSNXS: + case OP_TLBI_RVAE1NXS: + case OP_TLBI_RVAE1ISNXS: + case OP_TLBI_RVAE1OSNXS: + case OP_TLBI_RVALE2: + case OP_TLBI_RVALE2IS: + case OP_TLBI_RVALE2OS: + case OP_TLBI_RVALE1: + case OP_TLBI_RVALE1IS: + case OP_TLBI_RVALE1OS: + case OP_TLBI_RVALE2NXS: + case OP_TLBI_RVALE2ISNXS: + case OP_TLBI_RVALE2OSNXS: + case OP_TLBI_RVALE1NXS: + case OP_TLBI_RVALE1ISNXS: + case OP_TLBI_RVALE1OSNXS: + scope->type = TLBI_VA; + scope->va = decode_range_tlbi(val, &scope->size, &scope->asid); + break; + case OP_TLBI_RVAAE1: + case OP_TLBI_RVAAE1IS: + case OP_TLBI_RVAAE1OS: + case OP_TLBI_RVAAE1NXS: + case OP_TLBI_RVAAE1ISNXS: + case OP_TLBI_RVAAE1OSNXS: + case OP_TLBI_RVAALE1: + case OP_TLBI_RVAALE1IS: + case OP_TLBI_RVAALE1OS: + case OP_TLBI_RVAALE1NXS: + case OP_TLBI_RVAALE1ISNXS: + case OP_TLBI_RVAALE1OSNXS: + scope->type = TLBI_VAA; + scope->va = decode_range_tlbi(val, &scope->size, NULL); + break; + } +} + +void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val) +{ + struct s1e2_tlbi_scope scope = {}; + + compute_s1_tlbi_range(vcpu, inst, val, &scope); + + guard(write_lock)(&vcpu->kvm->mmu_lock); + invalidate_vncr_va(vcpu->kvm, &scope); +} + +void kvm_nested_s2_wp(struct kvm *kvm) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu)); + } + + kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits)); +} + +void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block); + } + + kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits)); +} + +void kvm_nested_s2_flush(struct kvm *kvm) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu)); + } +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!WARN_ON(atomic_read(&mmu->refcnt))) + kvm_free_stage2_pgd(mmu); + } + kvfree(kvm->arch.nested_mmus); + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; + kvm_uninit_stage2_mmu(kvm); +} + +/* + * Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter: + * + * - We introduce an internal representation of a vcpu-private TLB, + * representing the mapping between the guest VA contained in VNCR_EL2, + * the IPA the guest's EL2 PTs point to, and the actual PA this lives at. + * + * - On translation fault from a nested VNCR access, we create such a TLB. + * If there is no mapping to describe, the guest inherits the fault. + * Crucially, no actual mapping is done at this stage. + * + * - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above + * TLB exists, we map it in the fixmap for this CPU, and run with it. We + * have to respect the permissions dictated by the guest, but not the + * memory type (FWB is a must). + * + * - Note that we usually don't do a vcpu_load() on the back of a fault + * (unless we are preempted), so the resolution of a translation fault + * must go via a request that will map the VNCR page in the fixmap. + * vcpu_load() might as well use the same mechanism. + * + * - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was + * mapped, we unmap it. Yes it is that simple. The TLB still exists + * though, and may be reused at a later load. + * + * - On permission fault, we simply forward the fault to the guest's EL2. + * Get out of my way. + * + * - On any TLBI for the EL2&0 translation regime, we must find any TLB that + * intersects with the TLBI request, invalidate it, and unmap the page + * from the fixmap. Because we need to look at all the vcpu-private TLBs, + * this requires some wide-ranging locking to ensure that nothing races + * against it. This may require some refcounting to avoid the search when + * no such TLB is present. + * + * - On MMU notifiers, we must invalidate our TLB in a similar way, but + * looking at the IPA instead. The funny part is that there may not be a + * stage-2 mapping for this page if L1 hasn't accessed it using LD/ST + * instructions. + */ + +int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu) +{ + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return 0; + + vcpu->arch.vncr_tlb = kzalloc(sizeof(*vcpu->arch.vncr_tlb), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.vncr_tlb) + return -ENOMEM; + + return 0; +} + +static u64 read_vncr_el2(struct kvm_vcpu *vcpu) +{ + return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48); +} + +static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem) +{ + struct kvm_memory_slot *memslot; + bool write_fault, writable; + unsigned long mmu_seq; + struct vncr_tlb *vt; + struct page *page; + u64 va, pfn, gfn; + int ret; + + vt = vcpu->arch.vncr_tlb; + + /* + * If we're about to walk the EL2 S1 PTs, we must invalidate the + * current TLB, as it could be sampled from another vcpu doing a + * TLBI *IS. A real CPU wouldn't do that, but we only keep a single + * translation, so not much of a choice. + * + * We also prepare the next walk wilst we're at it. + */ + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { + invalidate_vncr(vt); + + vt->wi = (struct s1_walk_info) { + .regime = TR_EL20, + .as_el0 = false, + .pan = false, + }; + vt->wr = (struct s1_walk_result){}; + } + + guard(srcu)(&vcpu->kvm->srcu); + + va = read_vncr_el2(vcpu); + + ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va); + if (ret) + return ret; + + write_fault = kvm_is_write_fault(vcpu); + + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + gfn = vt->wr.pa >> PAGE_SHIFT; + memslot = gfn_to_memslot(vcpu->kvm, gfn); + if (!memslot) + return -EFAULT; + + *is_gmem = kvm_slot_has_gmem(memslot); + if (!*is_gmem) { + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, + &writable, &page); + if (is_error_noslot_pfn(pfn) || (write_fault && !writable)) + return -EFAULT; + } else { + ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL); + if (ret) { + kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE, + write_fault, false, false); + return ret; + } + } + + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { + if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) + return -EAGAIN; + + vt->gva = va; + vt->hpa = pfn << PAGE_SHIFT; + vt->valid = true; + vt->cpu = -1; + + kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu); + kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw); + } + + if (vt->wr.pw) + mark_page_dirty(vcpu->kvm, gfn); + + return 0; +} + +static void inject_vncr_perm(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 esr = kvm_vcpu_get_esr(vcpu); + + /* Adjust the fault level to reflect that of the guest's */ + esr &= ~ESR_ELx_FSC; + esr |= FIELD_PREP(ESR_ELx_FSC, + ESR_ELx_FSC_PERM_L(vt->wr.level)); + + kvm_inject_nested_sync(vcpu, esr); +} + +static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + + lockdep_assert_held_read(&vcpu->kvm->mmu_lock); + + if (!vt->valid) + return false; + + if (read_vncr_el2(vcpu) != vt->gva) + return false; + + if (vt->wr.nG) { + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + u64 ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + u16 asid; + + asid = FIELD_GET(TTBR_ASID_MASK, ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + return asid == vt->wr.asid; + } + + return true; +} + +int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 esr = kvm_vcpu_get_esr(vcpu); + + WARN_ON_ONCE(!(esr & ESR_ELx_VNCR)); + + if (kvm_vcpu_abt_issea(vcpu)) + return kvm_handle_guest_sea(vcpu); + + if (esr_fsc_is_permission_fault(esr)) { + inject_vncr_perm(vcpu); + } else if (esr_fsc_is_translation_fault(esr)) { + bool valid, is_gmem = false; + int ret; + + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + valid = kvm_vncr_tlb_lookup(vcpu); + + if (!valid) + ret = kvm_translate_vncr(vcpu, &is_gmem); + else + ret = -EPERM; + + switch (ret) { + case -EAGAIN: + /* Let's try again... */ + break; + case -ENOMEM: + /* + * For guest_memfd, this indicates that it failed to + * create a folio to back the memory. Inform userspace. + */ + if (is_gmem) + return 0; + /* Otherwise, let's try again... */ + break; + case -EFAULT: + case -EIO: + case -EHWPOISON: + if (is_gmem) + return 0; + fallthrough; + case -EINVAL: + case -ENOENT: + case -EACCES: + /* + * Translation failed, inject the corresponding + * exception back to EL2. + */ + BUG_ON(!vt->wr.failed); + + esr &= ~ESR_ELx_FSC; + esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst); + + kvm_inject_nested_sync(vcpu, esr); + break; + case -EPERM: + /* Hack to deal with POE until we get kernel support */ + inject_vncr_perm(vcpu); + break; + case 0: + break; + } + } else { + WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr); + } + + return 1; +} + +static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + pgprot_t prot; + + guard(preempt)(); + guard(read_lock)(&vcpu->kvm->mmu_lock); + + /* + * The request to map VNCR may have raced against some other + * event, such as an interrupt, and may not be valid anymore. + */ + if (is_hyp_ctxt(vcpu)) + return; + + /* + * Check that the pseudo-TLB is valid and that VNCR_EL2 still + * contains the expected value. If it doesn't, we simply bail out + * without a mapping -- a transformed MSR/MRS will generate the + * fault and allows us to populate the pseudo-TLB. + */ + if (!vt->valid) + return; + + if (read_vncr_el2(vcpu) != vt->gva) + return; + + if (vt->wr.nG) { + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + u64 ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + u16 asid; + + asid = FIELD_GET(TTBR_ASID_MASK, ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + if (asid != vt->wr.asid) + return; + } + + vt->cpu = smp_processor_id(); + + if (vt->wr.pw && vt->wr.pr) + prot = PAGE_KERNEL; + else if (vt->wr.pr) + prot = PAGE_KERNEL_RO; + else + prot = PAGE_NONE; + + /* + * We can't map write-only (or no permission at all) in the kernel, + * but the guest can do it if using POE, so we'll have to turn a + * translation fault into a permission fault at runtime. + * FIXME: WO doesn't work at all, need POE support in the kernel. + */ + if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) { + __set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot); + host_data_set_flag(L1_VNCR_MAPPED); + atomic_inc(&vcpu->kvm->arch.vncr_map_count); + } +} + +#define has_tgran_2(__r, __sz) \ + ({ \ + u64 _s1, _s2, _mmfr0 = __r; \ + \ + _s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz##_2, _mmfr0); \ + \ + _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz, _mmfr0); \ + \ + ((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \ + _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \ + (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \ + _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \ + }) +/* + * Our emulated CPU doesn't support all the possible features. For the + * sake of simplicity (and probably mental sanity), wipe out a number + * of feature bits we don't intend to support for the time being. + * This list should get updated as new features get added to the NV + * support, and new extension to the architecture. + */ +u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) +{ + u64 orig_val = val; + + switch (reg) { + case SYS_ID_AA64ISAR0_EL1: + /* Support everything but TME */ + val &= ~ID_AA64ISAR0_EL1_TME; + break; + + case SYS_ID_AA64ISAR1_EL1: + /* Support everything but LS64 and Spec Invalidation */ + val &= ~(ID_AA64ISAR1_EL1_LS64 | + ID_AA64ISAR1_EL1_SPECRES); + break; + + case SYS_ID_AA64PFR0_EL1: + /* No RME, AMU, MPAM, or S-EL2 */ + val &= ~(ID_AA64PFR0_EL1_RME | + ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SEL2 | + ID_AA64PFR0_EL1_EL3 | + ID_AA64PFR0_EL1_EL2 | + ID_AA64PFR0_EL1_EL1 | + ID_AA64PFR0_EL1_EL0); + /* 64bit only at any EL */ + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP); + break; + + case SYS_ID_AA64PFR1_EL1: + /* Only support BTI, SSBS, CSV2_frac */ + val &= ~(ID_AA64PFR1_EL1_PFAR | + ID_AA64PFR1_EL1_MTEX | + ID_AA64PFR1_EL1_THE | + ID_AA64PFR1_EL1_GCS | + ID_AA64PFR1_EL1_MTE_frac | + ID_AA64PFR1_EL1_NMI | + ID_AA64PFR1_EL1_SME | + ID_AA64PFR1_EL1_RES0 | + ID_AA64PFR1_EL1_MPAM_frac | + ID_AA64PFR1_EL1_MTE); + break; + + case SYS_ID_AA64MMFR0_EL1: + /* Hide ExS, Secure Memory */ + val &= ~(ID_AA64MMFR0_EL1_EXS | + ID_AA64MMFR0_EL1_TGRAN4_2 | + ID_AA64MMFR0_EL1_TGRAN16_2 | + ID_AA64MMFR0_EL1_TGRAN64_2 | + ID_AA64MMFR0_EL1_SNSMEM); + + /* Hide CNTPOFF if present */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP); + + /* Disallow unsupported S2 page sizes */ + switch (PAGE_SIZE) { + case SZ_64K: + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI); + fallthrough; + case SZ_16K: + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI); + fallthrough; + case SZ_4K: + /* Support everything */ + break; + } + + /* + * Since we can't support a guest S2 page size smaller + * than the host's own page size (due to KVM only + * populating its own S2 using the kernel's page + * size), advertise the limitation using FEAT_GTG. + */ + switch (PAGE_SIZE) { + case SZ_4K: + if (has_tgran_2(orig_val, 4)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP); + fallthrough; + case SZ_16K: + if (has_tgran_2(orig_val, 16)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP); + fallthrough; + case SZ_64K: + if (has_tgran_2(orig_val, 64)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP); + break; + } + + /* Cap PARange to 48bits */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48); + break; + + case SYS_ID_AA64MMFR1_EL1: + val &= ~(ID_AA64MMFR1_EL1_CMOW | + ID_AA64MMFR1_EL1_nTLBPA | + ID_AA64MMFR1_EL1_ETS); + + /* FEAT_E2H0 implies no VHE */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) + val &= ~ID_AA64MMFR1_EL1_VH; + + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF); + break; + + case SYS_ID_AA64MMFR2_EL1: + val &= ~(ID_AA64MMFR2_EL1_BBM | + ID_AA64MMFR2_EL1_TTL | + GENMASK_ULL(47, 44) | + ID_AA64MMFR2_EL1_ST | + ID_AA64MMFR2_EL1_CCIDX | + ID_AA64MMFR2_EL1_VARange); + + /* Force TTL support */ + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP); + break; + + case SYS_ID_AA64MMFR4_EL1: + /* + * You get EITHER + * + * - FEAT_VHE without FEAT_E2H0 + * - FEAT_NV limited to FEAT_NV2 + * - HCR_EL2.NV1 being RES0 + * + * OR + * + * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV + * + * Life is too short for anything else. + */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) { + val = 0; + } else { + val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1); + } + break; + + case SYS_ID_AA64DFR0_EL1: + /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */ + val &= ~(ID_AA64DFR0_EL1_ExtTrcBuff | + ID_AA64DFR0_EL1_BRBE | + ID_AA64DFR0_EL1_MTPMU | + ID_AA64DFR0_EL1_TraceBuffer | + ID_AA64DFR0_EL1_TraceFilt | + ID_AA64DFR0_EL1_PMSVer | + ID_AA64DFR0_EL1_CTX_CMPs | + ID_AA64DFR0_EL1_SEBEP | + ID_AA64DFR0_EL1_PMSS | + ID_AA64DFR0_EL1_TraceVer); + + /* + * FEAT_Debugv8p9 requires support for extended breakpoints / + * watchpoints. + */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); + break; + } + + return val; +} + +u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg sr, u64 v) +{ + struct kvm_sysreg_masks *masks; + + masks = vcpu->kvm->arch.sysreg_masks; + + if (masks) { + sr -= __SANITISED_REG_START__; + + v &= ~masks->mask[sr].res0; + v |= masks->mask[sr].res1; + } + + return v; +} + +static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) +{ + int i = sr - __SANITISED_REG_START__; + + BUILD_BUG_ON(!__builtin_constant_p(sr)); + BUILD_BUG_ON(sr < __SANITISED_REG_START__); + BUILD_BUG_ON(sr >= NR_SYS_REGS); + + kvm->arch.sysreg_masks->mask[i].res0 = res0; + kvm->arch.sysreg_masks->mask[i].res1 = res1; +} + +int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 res0, res1; + + lockdep_assert_held(&kvm->arch.config_lock); + + if (kvm->arch.sysreg_masks) + goto out; + + kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), + GFP_KERNEL_ACCOUNT); + if (!kvm->arch.sysreg_masks) + return -ENOMEM; + + /* VTTBR_EL2 */ + res0 = res1 = 0; + if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16)) + res0 |= GENMASK(63, 56); + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP)) + res0 |= VTTBR_CNP_BIT; + set_sysreg_masks(kvm, VTTBR_EL2, res0, res1); + + /* VTCR_EL2 */ + res0 = GENMASK(63, 32) | GENMASK(30, 20); + res1 = BIT(31); + set_sysreg_masks(kvm, VTCR_EL2, res0, res1); + + /* VMPIDR_EL2 */ + res0 = GENMASK(63, 40) | GENMASK(30, 24); + res1 = BIT(31); + set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1); + + /* HCR_EL2 */ + get_reg_fixed_bits(kvm, HCR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HCR_EL2, res0, res1); + + /* HCRX_EL2 */ + get_reg_fixed_bits(kvm, HCRX_EL2, &res0, &res1); + set_sysreg_masks(kvm, HCRX_EL2, res0, res1); + + /* HFG[RW]TR_EL2 */ + get_reg_fixed_bits(kvm, HFGRTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGRTR_EL2, res0, res1); + get_reg_fixed_bits(kvm, HFGWTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGWTR_EL2, res0, res1); + + /* HDFG[RW]TR_EL2 */ + get_reg_fixed_bits(kvm, HDFGRTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGRTR_EL2, res0, res1); + get_reg_fixed_bits(kvm, HDFGWTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGWTR_EL2, res0, res1); + + /* HFGITR_EL2 */ + get_reg_fixed_bits(kvm, HFGITR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGITR_EL2, res0, res1); + + /* HAFGRTR_EL2 - not a lot to see here */ + get_reg_fixed_bits(kvm, HAFGRTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); + + /* HFG[RW]TR2_EL2 */ + get_reg_fixed_bits(kvm, HFGRTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGRTR2_EL2, res0, res1); + get_reg_fixed_bits(kvm, HFGWTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGWTR2_EL2, res0, res1); + + /* HDFG[RW]TR2_EL2 */ + get_reg_fixed_bits(kvm, HDFGRTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGRTR2_EL2, res0, res1); + get_reg_fixed_bits(kvm, HDFGWTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGWTR2_EL2, res0, res1); + + /* HFGITR2_EL2 */ + get_reg_fixed_bits(kvm, HFGITR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1); + + /* TCR2_EL2 */ + get_reg_fixed_bits(kvm, TCR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, TCR2_EL2, res0, res1); + + /* SCTLR_EL1 */ + get_reg_fixed_bits(kvm, SCTLR_EL1, &res0, &res1); + set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + + /* SCTLR2_ELx */ + get_reg_fixed_bits(kvm, SCTLR2_EL1, &res0, &res1); + set_sysreg_masks(kvm, SCTLR2_EL1, res0, res1); + get_reg_fixed_bits(kvm, SCTLR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, SCTLR2_EL2, res0, res1); + + /* MDCR_EL2 */ + get_reg_fixed_bits(kvm, MDCR_EL2, &res0, &res1); + set_sysreg_masks(kvm, MDCR_EL2, res0, res1); + + /* CNTHCTL_EL2 */ + res0 = GENMASK(63, 20); + res1 = 0; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP)) + res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) { + res0 |= CNTHCTL_ECV; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP)) + res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT | + CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT); + } + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= GENMASK(11, 8); + set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1); + + /* ICH_HCR_EL2 */ + res0 = ICH_HCR_EL2_RES0; + res1 = ICH_HCR_EL2_RES1; + if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS)) + res0 |= ICH_HCR_EL2_TDIR; + /* No GICv4 is presented to the guest */ + res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount; + set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1); + + /* VNCR_EL2 */ + set_sysreg_masks(kvm, VNCR_EL2, VNCR_EL2_RES0, VNCR_EL2_RES1); + +out: + for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) + __vcpu_rmw_sys_reg(vcpu, sr, |=, 0); + + return 0; +} + +void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) { + struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; + + write_lock(&vcpu->kvm->mmu_lock); + if (mmu->pending_unmap) { + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true); + mmu->pending_unmap = false; + } + write_unlock(&vcpu->kvm->mmu_lock); + } + + if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu)) + kvm_map_l1_vncr(vcpu); + + /* Must be last, as may switch context! */ + if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu)) + kvm_inject_nested_irq(vcpu); +} + +/* + * One of the many architectural bugs in FEAT_NV2 is that the guest hypervisor + * can write to HCR_EL2 behind our back, potentially changing the exception + * routing / masking for even the host context. + * + * What follows is some slop to (1) react to exception routing / masking and (2) + * preserve the pending SError state across translation regimes. + */ +void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu) +{ + if (!vcpu_has_nv(vcpu)) + return; + + if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING))) + kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu)); +} + +void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu) +{ + unsigned long *hcr = vcpu_hcr(vcpu); + + if (!vcpu_has_nv(vcpu)) + return; + + /* + * We previously decided that an SError was deliverable to the guest. + * Reap the pending state from HCR_EL2 and... + */ + if (unlikely(__test_and_clear_bit(__ffs(HCR_VSE), hcr))) + vcpu_set_flag(vcpu, NESTED_SERROR_PENDING); + + /* + * Re-attempt SError injection in case the deliverability has changed, + * which is necessary to faithfully emulate WFI the case of a pending + * SError being a wakeup condition. + */ + if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING))) + kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu)); +} + +/* + * KVM unconditionally sets most of these traps anyway but use an allowlist + * to document the guest hypervisor traps that may take precedence and guard + * against future changes to the non-nested trap configuration. + */ +#define NV_MDCR_GUEST_INCLUDE (MDCR_EL2_TDE | \ + MDCR_EL2_TDA | \ + MDCR_EL2_TDRA | \ + MDCR_EL2_TTRF | \ + MDCR_EL2_TPMS | \ + MDCR_EL2_TPM | \ + MDCR_EL2_TPMCR | \ + MDCR_EL2_TDCC | \ + MDCR_EL2_TDOSA) + +void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu) +{ + u64 guest_mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (is_nested_ctxt(vcpu)) + vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE); + /* + * In yet another example where FEAT_NV2 is fscking broken, accesses + * to MDSCR_EL1 are redirected to the VNCR despite having an effect + * at EL2. Use a big hammer to apply sanity. + * + * Unless of course we have FEAT_FGT, in which case we can precisely + * trap MDSCR_EL1. + */ + else if (!cpus_have_final_cap(ARM64_HAS_FGT)) + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; +} diff --git a/arch/arm64/kvm/pauth.c b/arch/arm64/kvm/pauth.c new file mode 100644 index 000000000000..d5eb3ae876be --- /dev/null +++ b/arch/arm64/kvm/pauth.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024 - Google LLC + * Author: Marc Zyngier <maz@kernel.org> + * + * Primitive PAuth emulation for ERETAA/ERETAB. + * + * This code assumes that is is run from EL2, and that it is part of + * the emulation of ERETAx for a guest hypervisor. That's a lot of + * baked-in assumptions and shortcuts. + * + * Do no reuse for anything else! + */ + +#include <linux/kvm_host.h> + +#include <asm/gpr-num.h> +#include <asm/kvm_emulate.h> +#include <asm/pointer_auth.h> + +/* PACGA Xd, Xn, Xm */ +#define PACGA(d,n,m) \ + asm volatile(__DEFINE_ASM_GPR_NUMS \ + ".inst 0x9AC03000 |" \ + "(.L__gpr_num_%[Rd] << 0) |" \ + "(.L__gpr_num_%[Rn] << 5) |" \ + "(.L__gpr_num_%[Rm] << 16)\n" \ + : [Rd] "=r" ((d)) \ + : [Rn] "r" ((n)), [Rm] "r" ((m))) + +static u64 compute_pac(struct kvm_vcpu *vcpu, u64 ptr, + struct ptrauth_key ikey) +{ + struct ptrauth_key gkey; + u64 mod, pac = 0; + + preempt_disable(); + + if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) + mod = __vcpu_sys_reg(vcpu, SP_EL2); + else + mod = read_sysreg(sp_el1); + + gkey.lo = read_sysreg_s(SYS_APGAKEYLO_EL1); + gkey.hi = read_sysreg_s(SYS_APGAKEYHI_EL1); + + __ptrauth_key_install_nosync(APGA, ikey); + isb(); + + PACGA(pac, ptr, mod); + isb(); + + __ptrauth_key_install_nosync(APGA, gkey); + + preempt_enable(); + + /* PAC in the top 32bits */ + return pac; +} + +static bool effective_tbi(struct kvm_vcpu *vcpu, bool bit55) +{ + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + bool tbi, tbid; + + /* + * Since we are authenticating an instruction address, we have + * to take TBID into account. If E2H==0, ignore VA[55], as + * TCR_EL2 only has a single TBI/TBID. If VA[55] was set in + * this case, this is likely a guest bug... + */ + if (!vcpu_el2_e2h_is_set(vcpu)) { + tbi = tcr & BIT(20); + tbid = tcr & BIT(29); + } else if (bit55) { + tbi = tcr & TCR_TBI1; + tbid = tcr & TCR_TBID1; + } else { + tbi = tcr & TCR_TBI0; + tbid = tcr & TCR_TBID0; + } + + return tbi && !tbid; +} + +static int compute_bottom_pac(struct kvm_vcpu *vcpu, bool bit55) +{ + static const int maxtxsz = 39; // Revisit these two values once + static const int mintxsz = 16; // (if) we support TTST/LVA/LVA2 + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + int txsz; + + if (!vcpu_el2_e2h_is_set(vcpu) || !bit55) + txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + else + txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + + return 64 - clamp(txsz, mintxsz, maxtxsz); +} + +static u64 compute_pac_mask(struct kvm_vcpu *vcpu, bool bit55) +{ + int bottom_pac; + u64 mask; + + bottom_pac = compute_bottom_pac(vcpu, bit55); + + mask = GENMASK(54, bottom_pac); + if (!effective_tbi(vcpu, bit55)) + mask |= GENMASK(63, 56); + + return mask; +} + +static u64 to_canonical_addr(struct kvm_vcpu *vcpu, u64 ptr, u64 mask) +{ + bool bit55 = !!(ptr & BIT(55)); + + if (bit55) + return ptr | mask; + + return ptr & ~mask; +} + +static u64 corrupt_addr(struct kvm_vcpu *vcpu, u64 ptr) +{ + bool bit55 = !!(ptr & BIT(55)); + u64 mask, error_code; + int shift; + + if (effective_tbi(vcpu, bit55)) { + mask = GENMASK(54, 53); + shift = 53; + } else { + mask = GENMASK(62, 61); + shift = 61; + } + + if (esr_iss_is_eretab(kvm_vcpu_get_esr(vcpu))) + error_code = 2 << shift; + else + error_code = 1 << shift; + + ptr &= ~mask; + ptr |= error_code; + + return ptr; +} + +/* + * Authenticate an ERETAA/ERETAB instruction, returning true if the + * authentication succeeded and false otherwise. In all cases, *elr + * contains the VA to ERET to. Potential exception injection is left + * to the caller. + */ +bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr) +{ + u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 ptr, cptr, pac, mask; + struct ptrauth_key ikey; + + *elr = ptr = vcpu_read_sys_reg(vcpu, ELR_EL2); + + /* We assume we're already in the context of an ERETAx */ + if (esr_iss_is_eretab(esr)) { + if (!(sctlr & SCTLR_EL1_EnIB)) + return true; + + ikey.lo = __vcpu_sys_reg(vcpu, APIBKEYLO_EL1); + ikey.hi = __vcpu_sys_reg(vcpu, APIBKEYHI_EL1); + } else { + if (!(sctlr & SCTLR_EL1_EnIA)) + return true; + + ikey.lo = __vcpu_sys_reg(vcpu, APIAKEYLO_EL1); + ikey.hi = __vcpu_sys_reg(vcpu, APIAKEYHI_EL1); + } + + mask = compute_pac_mask(vcpu, !!(ptr & BIT(55))); + cptr = to_canonical_addr(vcpu, ptr, mask); + + pac = compute_pac(vcpu, cptr, ikey); + + /* + * Slightly deviate from the pseudocode: if we have a PAC + * match with the signed pointer, then it must be good. + * Anything after this point is pure error handling. + */ + if ((pac & mask) == (ptr & mask)) { + *elr = cptr; + return true; + } + + /* + * Authentication failed, corrupt the canonical address if + * PAuth2 isn't implemented, or some XORing if it is. + */ + if (!kvm_has_pauth(vcpu->kvm, PAuth2)) + cptr = corrupt_addr(vcpu, cptr); + else + cptr = ptr ^ (pac & mask); + + *elr = cptr; + return false; +} diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index cf56958b1492..d7a0f69a9982 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -4,38 +4,26 @@ * Author: Quentin Perret <qperret@google.com> */ +#include <linux/init.h> +#include <linux/interval_tree_generic.h> +#include <linux/kmemleak.h> #include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> #include <linux/memblock.h> #include <linux/mutex.h> -#include <linux/sort.h> #include <asm/kvm_pkvm.h> #include "hyp_constants.h" +DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); + static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); phys_addr_t hyp_mem_base; phys_addr_t hyp_mem_size; -static int cmp_hyp_memblock(const void *p1, const void *p2) -{ - const struct memblock_region *r1 = p1; - const struct memblock_region *r2 = p2; - - return r1->base < r2->base ? -1 : (r1->base > r2->base); -} - -static void __init sort_memblock_regions(void) -{ - sort(hyp_memory, - *hyp_memblock_nr_ptr, - sizeof(struct memblock_region), - cmp_hyp_memblock, - NULL); -} - static int __init register_memblock_regions(void) { struct memblock_region *reg; @@ -47,7 +35,6 @@ static int __init register_memblock_regions(void) hyp_memory[*hyp_memblock_nr_ptr] = *reg; (*hyp_memblock_nr_ptr)++; } - sort_memblock_regions(); return 0; } @@ -74,6 +61,8 @@ void __init kvm_hyp_reserve(void) hyp_mem_pages += host_s2_pgtable_pages(); hyp_mem_pages += hyp_vm_table_pages(); hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); + hyp_mem_pages += pkvm_selftest_pages(); + hyp_mem_pages += hyp_ffa_proxy_pages(); /* * Try to allocate a PMD-aligned region to reduce TLB pressure once @@ -96,6 +85,47 @@ void __init kvm_hyp_reserve(void) hyp_mem_base); } +static void __pkvm_destroy_hyp_vm(struct kvm *kvm) +{ + if (pkvm_hyp_vm_is_created(kvm)) { + WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, + kvm->arch.pkvm.handle)); + } else if (kvm->arch.pkvm.handle) { + /* + * The VM could have been reserved but hyp initialization has + * failed. Make sure to unreserve it. + */ + kvm_call_hyp_nvhe(__pkvm_unreserve_vm, kvm->arch.pkvm.handle); + } + + kvm->arch.pkvm.handle = 0; + kvm->arch.pkvm.is_created = false; + free_hyp_memcache(&kvm->arch.pkvm.teardown_mc); + free_hyp_memcache(&kvm->arch.pkvm.stage2_teardown_mc); +} + +static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) +{ + size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); + pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle; + void *hyp_vcpu; + int ret; + + vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; + + hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vcpu) + return -ENOMEM; + + ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu); + if (!ret) + vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED); + else + free_pages_exact(hyp_vcpu, hyp_vcpu_sz); + + return ret; +} + /* * Allocates and donates memory for hypervisor VM structs at EL2. * @@ -106,19 +136,16 @@ void __init kvm_hyp_reserve(void) * * Return 0 on success, negative error code on failure. */ -static int __pkvm_create_hyp_vm(struct kvm *host_kvm) +static int __pkvm_create_hyp_vm(struct kvm *kvm) { - size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz; - struct kvm_vcpu *host_vcpu; - pkvm_handle_t handle; + size_t pgd_sz, hyp_vm_sz; void *pgd, *hyp_vm; - unsigned long idx; int ret; - if (host_kvm->created_vcpus < 1) + if (kvm->created_vcpus < 1) return -EINVAL; - pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); + pgd_sz = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr); /* * The PGD pages will be reclaimed using a hyp_memcache which implies @@ -132,7 +159,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) /* Allocate memory to donate to hyp for vm and vcpu pointers. */ hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, size_mul(sizeof(void *), - host_kvm->created_vcpus))); + kvm->created_vcpus))); hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); if (!hyp_vm) { ret = -ENOMEM; @@ -140,44 +167,15 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) } /* Donate the VM memory to hyp and let hyp initialize it. */ - ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd); - if (ret < 0) + ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd); + if (ret) goto free_vm; - handle = ret; - - host_kvm->arch.pkvm.handle = handle; - - /* Donate memory for the vcpus at hyp and initialize it. */ - hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); - kvm_for_each_vcpu(idx, host_vcpu, host_kvm) { - void *hyp_vcpu; - - /* Indexing of the vcpus to be sequential starting at 0. */ - if (WARN_ON(host_vcpu->vcpu_idx != idx)) { - ret = -EINVAL; - goto destroy_vm; - } - - hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); - if (!hyp_vcpu) { - ret = -ENOMEM; - goto destroy_vm; - } - - ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, - hyp_vcpu); - if (ret) { - free_pages_exact(hyp_vcpu, hyp_vcpu_sz); - goto destroy_vm; - } - } + kvm->arch.pkvm.is_created = true; + kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; + kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE); return 0; - -destroy_vm: - pkvm_destroy_hyp_vm(host_kvm); - return ret; free_vm: free_pages_exact(hyp_vm, hyp_vm_sz); free_pgd: @@ -185,31 +183,305 @@ free_pgd: return ret; } -int pkvm_create_hyp_vm(struct kvm *host_kvm) +bool pkvm_hyp_vm_is_created(struct kvm *kvm) +{ + return READ_ONCE(kvm->arch.pkvm.is_created); +} + +int pkvm_create_hyp_vm(struct kvm *kvm) { int ret = 0; - mutex_lock(&host_kvm->lock); - if (!host_kvm->arch.pkvm.handle) - ret = __pkvm_create_hyp_vm(host_kvm); - mutex_unlock(&host_kvm->lock); + mutex_lock(&kvm->arch.config_lock); + if (!pkvm_hyp_vm_is_created(kvm)) + ret = __pkvm_create_hyp_vm(kvm); + mutex_unlock(&kvm->arch.config_lock); return ret; } -void pkvm_destroy_hyp_vm(struct kvm *host_kvm) +int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) { - if (host_kvm->arch.pkvm.handle) { - WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, - host_kvm->arch.pkvm.handle)); + int ret = 0; + + mutex_lock(&vcpu->kvm->arch.config_lock); + if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED)) + ret = __pkvm_create_hyp_vcpu(vcpu); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return ret; +} + +void pkvm_destroy_hyp_vm(struct kvm *kvm) +{ + mutex_lock(&kvm->arch.config_lock); + __pkvm_destroy_hyp_vm(kvm); + mutex_unlock(&kvm->arch.config_lock); +} + +int pkvm_init_host_vm(struct kvm *kvm) +{ + int ret; + + if (pkvm_hyp_vm_is_created(kvm)) + return -EINVAL; + + /* VM is already reserved, no need to proceed. */ + if (kvm->arch.pkvm.handle) + return 0; + + /* Reserve the VM in hyp and obtain a hyp handle for the VM. */ + ret = kvm_call_hyp_nvhe(__pkvm_reserve_vm); + if (ret < 0) + return ret; + + kvm->arch.pkvm.handle = ret; + + return 0; +} + +static void __init _kvm_host_prot_finalize(void *arg) +{ + int *err = arg; + + if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize))) + WRITE_ONCE(*err, -EINVAL); +} + +static int __init pkvm_drop_host_privileges(void) +{ + int ret = 0; + + /* + * Flip the static key upfront as that may no longer be possible + * once the host stage 2 is installed. + */ + static_branch_enable(&kvm_protected_mode_initialized); + on_each_cpu(_kvm_host_prot_finalize, &ret, 1); + return ret; +} + +static int __init finalize_pkvm(void) +{ + int ret; + + if (!is_protected_kvm_enabled() || !is_kvm_arm_initialised()) + return 0; + + /* + * Exclude HYP sections from kmemleak so that they don't get peeked + * at, which would end badly once inaccessible. + */ + kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); + kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start); + kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start); + kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); + + ret = pkvm_drop_host_privileges(); + if (ret) + pr_err("Failed to finalize Hyp protection: %d\n", ret); + + return ret; +} +device_initcall_sync(finalize_pkvm); + +static u64 __pkvm_mapping_start(struct pkvm_mapping *m) +{ + return m->gfn * PAGE_SIZE; +} + +static u64 __pkvm_mapping_end(struct pkvm_mapping *m) +{ + return (m->gfn + m->nr_pages) * PAGE_SIZE - 1; +} + +INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last, + __pkvm_mapping_start, __pkvm_mapping_end, static, + pkvm_mapping); + +/* + * __tmp is updated to iter_first(pkvm_mappings) *before* entering the body of the loop to allow + * freeing of __map inline. + */ +#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \ + for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings, \ + __start, __end - 1); \ + __tmp && ({ \ + __map = __tmp; \ + __tmp = pkvm_mapping_iter_next(__map, __start, __end - 1); \ + true; \ + }); \ + ) + +int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) +{ + pgt->pkvm_mappings = RB_ROOT_CACHED; + pgt->mmu = mmu; + + return 0; +} + +static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret; + + if (!handle) + return 0; + + for_each_mapping_in_range_safe(pgt, start, end, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn, + mapping->nr_pages); + if (WARN_ON(ret)) + return ret; + pkvm_mapping_remove(mapping, &pgt->pkvm_mappings); + kfree(mapping); } - host_kvm->arch.pkvm.handle = 0; - free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); + return 0; +} + +void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size) +{ + __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); +} + +void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) +{ + /* Expected to be called after all pKVM mappings have been released. */ + WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root)); } -int pkvm_init_host_vm(struct kvm *host_kvm) +int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + void *mc, enum kvm_pgtable_walk_flags flags) { - mutex_init(&host_kvm->lock); + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping = NULL; + struct kvm_hyp_memcache *cache = mc; + u64 gfn = addr >> PAGE_SHIFT; + u64 pfn = phys >> PAGE_SHIFT; + int ret; + + if (size != PAGE_SIZE && size != PMD_SIZE) + return -EINVAL; + + lockdep_assert_held_write(&kvm->mmu_lock); + + /* + * Calling stage2_map() on top of existing mappings is either happening because of a race + * with another vCPU, or because we're changing between page and block mappings. As per + * user_mem_abort(), same-size permission faults are handled in the relax_perms() path. + */ + mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1); + if (mapping) { + if (size == (mapping->nr_pages * PAGE_SIZE)) + return -EAGAIN; + + /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */ + ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); + if (ret) + return ret; + mapping = NULL; + } + + ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot); + if (WARN_ON(ret)) + return ret; + + swap(mapping, cache->mapping); + mapping->gfn = gfn; + mapping->pfn = pfn; + mapping->nr_pages = size / PAGE_SIZE; + pkvm_mapping_insert(mapping, &pgt->pkvm_mappings); + + return ret; +} + +int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock); + + return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); +} + +int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret = 0; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn, + mapping->nr_pages); + if (WARN_ON(ret)) + break; + } + + return ret; +} + +int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), + PAGE_SIZE * mapping->nr_pages); + return 0; } + +bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + bool young = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn, + mapping->nr_pages, mkold); + + return young; +} + +int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, + enum kvm_pgtable_walk_flags flags) +{ + return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot); +} + +void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) +{ + WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT)); +} + +void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) +{ + WARN_ON_ONCE(1); +} + +kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, + enum kvm_pgtable_prot prot, void *mc, bool force_pte) +{ + WARN_ON_ONCE(1); + return NULL; +} + +int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 24908400e190..b03dbda7f1ab 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -17,13 +17,18 @@ #define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0) -DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); - static LIST_HEAD(arm_pmus); static DEFINE_MUTEX(arm_pmus_lock); static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); +static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc); + +bool kvm_supports_guest_pmuv3(void) +{ + guard(mutex)(&arm_pmus_lock); + return !list_empty(&arm_pmus); +} static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) { @@ -35,12 +40,8 @@ static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx) return &vcpu->arch.pmu.pmc[cnt_idx]; } -static u32 kvm_pmu_event_mask(struct kvm *kvm) +static u32 __kvm_pmu_event_mask(unsigned int pmuver) { - unsigned int pmuver; - - pmuver = kvm->arch.arm_pmu->pmuver; - switch (pmuver) { case ID_AA64DFR0_EL1_PMUVer_IMP: return GENMASK(9, 0); @@ -55,19 +56,49 @@ static u32 kvm_pmu_event_mask(struct kvm *kvm) } } +static u32 kvm_pmu_event_mask(struct kvm *kvm) +{ + u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); + u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0); + + return __kvm_pmu_event_mask(pmuver); +} + +u64 kvm_pmu_evtyper_mask(struct kvm *kvm) +{ + u64 mask = ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0 | + kvm_pmu_event_mask(kvm); + + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP)) + mask |= ARMV8_PMU_INCLUDE_EL2; + + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) + mask |= ARMV8_PMU_EXCLUDE_NS_EL0 | + ARMV8_PMU_EXCLUDE_NS_EL1 | + ARMV8_PMU_EXCLUDE_EL3; + + return mask; +} + /** * kvm_pmc_is_64bit - determine if counter is 64bit * @pmc: counter context */ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) { + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + return (pmc->idx == ARMV8_PMU_CYCLE_IDX || - kvm_pmu_is_3p5(kvm_pmc_to_vcpu(pmc))); + kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)); } static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { - u64 val = __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), PMCR_EL0); + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 val = kvm_vcpu_read_pmcr(vcpu); + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP; return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); @@ -89,6 +120,11 @@ static u32 counter_index_to_evtreg(u64 idx) return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; } +static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc) +{ + return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx)); +} + static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); @@ -118,9 +154,6 @@ static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) */ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); } @@ -145,7 +178,7 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) val |= lower_32_bits(val); } - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); /* Recreate the perf event to reflect the updated sample_period */ kvm_pmu_create_perf_event(pmc); @@ -159,13 +192,23 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) */ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { - if (!kvm_vcpu_has_pmu(vcpu)) - return; - kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false); } /** + * kvm_pmu_set_counter_value_user - set PMU counter value from user + * @vcpu: The vcpu pointer + * @select_idx: The counter index + * @val: The counter value + */ +void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) +{ + kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); + __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(select_idx), val); + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); +} + +/** * kvm_pmu_release_perf_event - remove the perf event * @pmc: The PMU counter pointer */ @@ -196,7 +239,7 @@ static void kvm_pmu_stop_counter(struct kvm_pmc *pmc) reg = counter_index_to_reg(pmc->idx); - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); kvm_pmu_release_perf_event(pmc); } @@ -211,25 +254,11 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) pmu->pmc[i].idx = i; } /** - * kvm_pmu_vcpu_reset - reset pmu state for cpu - * @vcpu: The vcpu pointer - * - */ -void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) -{ - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); - int i; - - for_each_set_bit(i, &mask, 32) - kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i)); -} - -/** * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu * @vcpu: The vcpu pointer * @@ -238,92 +267,128 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); irq_work_sync(&vcpu->arch.pmu.overflow_work); } -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) +static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu) { - u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT; + unsigned int hpmn, n; + + if (!vcpu_has_nv(vcpu)) + return 0; + + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + n = vcpu->kvm->arch.nr_pmu_counters; + + /* + * Programming HPMN to a value greater than PMCR_EL0.N is + * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an + * UNKNOWN number of counters (in our case, zero) are reserved for EL2. + */ + if (hpmn >= n) + return 0; + + /* + * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't + * implemented. Since KVM's ability to emulate HPMN=0 does not directly + * depend on hardware (all PMU registers are trapped), make the + * implementation choice that all counters are included in the second + * range reserved for EL2/EL3. + */ + return GENMASK(n - 1, hpmn); +} + +bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) +{ + return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx); +} + +u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + + if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) + return mask; + + return mask & ~kvm_pmu_hyp_counter_mask(vcpu); +} + +u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); - val &= ARMV8_PMU_PMCR_N_MASK; if (val == 0) return BIT(ARMV8_PMU_CYCLE_IDX); else return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); } -/** - * kvm_pmu_enable_counter_mask - enable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENSET register - * - * Call perf_event_enable to start counting the perf event - */ -void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc) { - int i; - if (!kvm_vcpu_has_pmu(vcpu)) + if (!pmc->perf_event) { + kvm_pmu_create_perf_event(pmc); return; + } - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) - return; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc; - - if (!(val & BIT(i))) - continue; - - pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + perf_event_enable(pmc->perf_event); + if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) + kvm_debug("fail to enable perf event\n"); +} - if (!pmc->perf_event) { - kvm_pmu_create_perf_event(pmc); - } else { - perf_event_enable(pmc->perf_event); - if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) - kvm_debug("fail to enable perf event\n"); - } - } +static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) + perf_event_disable(pmc->perf_event); } -/** - * kvm_pmu_disable_counter_mask - disable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENCLR register - * - * Call perf_event_disable to stop counting the perf event - */ -void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; - if (!kvm_vcpu_has_pmu(vcpu) || !val) + if (!val) return; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc; + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); if (!(val & BIT(i))) continue; - pmc = kvm_vcpu_idx_to_pmc(vcpu, i); - - if (pmc->perf_event) - perf_event_disable(pmc->perf_event); + if (kvm_pmu_counter_is_enabled(pmc)) + kvm_pmc_enable_perf_event(pmc); + else + kvm_pmc_disable_perf_event(pmc); } + + kvm_vcpu_pmu_restore_guest(vcpu); } -static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) +/* + * Returns the PMU overflow state, which is true if there exists an event + * counter where the values of the global enable control, PMOVSSET_EL0[n], and + * PMINTENSET_EL1[n] are all 1. + */ +static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) { - u64 reg = 0; + u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); - if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) { - reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); - reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); - } + reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); + + /* + * PMCR_EL0.E is the global enable control for event counters available + * to EL0 and EL1. + */ + if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) + reg &= kvm_pmu_hyp_counter_mask(vcpu); + + /* + * Otherwise, MDCR_EL2.HPME is the global enable control for event + * counters reserved for EL2. + */ + if (!(vcpu_read_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HPME)) + reg &= ~kvm_pmu_hyp_counter_mask(vcpu); return reg; } @@ -333,17 +398,14 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; bool overflow; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - - overflow = !!kvm_pmu_overflow_status(vcpu); + overflow = kvm_pmu_overflow_status(vcpu); if (pmu->irq_level == overflow) return; pmu->irq_level = overflow; if (likely(irqchip_in_kernel(vcpu->kvm))) { - int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, pmu->irq_num, overflow, pmu); WARN_ON(ret); } @@ -398,7 +460,7 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) kvm_pmu_update_state(vcpu); } -/** +/* * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding * to the event. * This is why we need a callback to do it once outside of the NMI context. @@ -421,7 +483,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, { int i; - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) + if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) return; /* Weed out disabled counters */ @@ -441,14 +503,14 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1; if (!kvm_pmc_is_64bit(pmc)) reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg; + __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(i), reg); /* No overflow? move on */ if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg)) continue; /* Mark overflow */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(i)); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(i + 1), @@ -469,7 +531,7 @@ static u64 compute_period(struct kvm_pmc *pmc, u64 counter) return val; } -/** +/* * When the perf event overflows, set the overflow status and inform the vcpu. */ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, @@ -494,7 +556,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, perf_event->attr.sample_period = period; perf_event->hw.sample_period = period; - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(idx)); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(idx + 1), @@ -531,29 +593,27 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { int i; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */ - if (!kvm_pmu_is_3p5(vcpu)) + if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) val &= ~ARMV8_PMU_PMCR_LP; - __vcpu_sys_reg(vcpu, PMCR_EL0) = val; + /* Request a reload of the PMU to enable/disable affected counters */ + if ((__vcpu_sys_reg(vcpu, PMCR_EL0) ^ val) & ARMV8_PMU_PMCR_E) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); - if (val & ARMV8_PMU_PMCR_E) { - kvm_pmu_enable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); - } else { - kvm_pmu_disable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); - } + /* The reset bits don't indicate any state, and shouldn't be saved. */ + __vcpu_assign_sys_reg(vcpu, PMCR_EL0, (val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P))); if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); - mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); + unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) & + ~BIT(ARMV8_PMU_CYCLE_IDX); + + if (!vcpu_is_el2(vcpu)) + mask &= ~kvm_pmu_hyp_counter_mask(vcpu); + for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } @@ -562,8 +622,58 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && - (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)); + unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx))) + return false; + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return mdcr & MDCR_EL2_HPME; + + return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E; +} + +static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0; + bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0; + + return u == nsu; +} + +static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1; + bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1; + + return p == nsk; +} + +static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD)) + return false; + + return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2; +} + +static int kvm_map_pmu_event(struct kvm *kvm, unsigned int eventsel) +{ + struct arm_pmu *pmu = kvm->arch.arm_pmu; + + /* + * The CPU PMU likely isn't PMUv3; let the driver provide a mapping + * for the guest's PMUv3 event ID. + */ + if (unlikely(pmu->map_pmuv3_event)) + return pmu->map_pmuv3_event(eventsel); + + return eventsel; } /** @@ -576,16 +686,16 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct perf_event *event; struct perf_event_attr attr; - u64 eventsel, reg, data; + int eventsel; + u64 evtreg; - reg = counter_index_to_evtreg(pmc->idx); - data = __vcpu_sys_reg(vcpu, reg); + evtreg = kvm_pmc_read_evtreg(pmc); kvm_pmu_stop_counter(pmc); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; else - eventsel = data & kvm_pmu_event_mask(vcpu->kvm); + eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm); /* * Neither SW increment nor chained events need to be backed @@ -603,18 +713,34 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; + /* + * Don't create an event if we're running on hardware that requires + * PMUv3 event translation and we couldn't find a valid mapping. + */ + eventsel = kvm_map_pmu_event(vcpu->kvm, eventsel); + if (eventsel < 0) + return; + memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; attr.disabled = !kvm_pmu_counter_is_enabled(pmc); - attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0; - attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; + attr.exclude_user = !kvm_pmc_counts_at_el0(pmc); attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; /* + * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the + * guest's EL2 filter. + */ + if (unlikely(is_hyp_ctxt(vcpu))) + attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc); + else + attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); + + /* * If counting with a 64bit counter, advertise it to the perf * code, carefully dealing with the initial sample period * which also depends on the overflow. @@ -650,18 +776,10 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx); - u64 reg, mask; - - if (!kvm_vcpu_has_pmu(vcpu)) - return; - - mask = ARMV8_PMU_EVTYPE_MASK; - mask &= ~ARMV8_PMU_EVTYPE_EVENT; - mask |= kvm_pmu_event_mask(vcpu->kvm); + u64 reg; reg = counter_index_to_evtreg(pmc->idx); - - __vcpu_sys_reg(vcpu, reg) = data & mask; + __vcpu_assign_sys_reg(vcpu, reg, (data & kvm_pmu_evtyper_mask(vcpu->kvm))); kvm_pmu_create_perf_event(pmc); } @@ -670,93 +788,106 @@ void kvm_host_pmu_init(struct arm_pmu *pmu) { struct arm_pmu_entry *entry; - if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI || - pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + /* + * Check the sanitised PMU version for the system, as KVM does not + * support implementations where PMUv3 exists on a subset of CPUs. + */ + if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit())) return; - mutex_lock(&arm_pmus_lock); + guard(mutex)(&arm_pmus_lock); entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) - goto out_unlock; + return; entry->arm_pmu = pmu; list_add_tail(&entry->entry, &arm_pmus); - - if (list_is_singular(&arm_pmus)) - static_branch_enable(&kvm_arm_pmu_available); - -out_unlock: - mutex_unlock(&arm_pmus_lock); } static struct arm_pmu *kvm_pmu_probe_armpmu(void) { - struct perf_event_attr attr = { }; - struct perf_event *event; - struct arm_pmu *pmu = NULL; + struct arm_pmu_entry *entry; + struct arm_pmu *pmu; + int cpu; + + guard(mutex)(&arm_pmus_lock); /* - * Create a dummy event that only counts user cycles. As we'll never - * leave this function with the event being live, it will never - * count anything. But it allows us to probe some of the PMU - * details. Yes, this is terrible. + * It is safe to use a stale cpu to iterate the list of PMUs so long as + * the same value is used for the entirety of the loop. Given this, and + * the fact that no percpu data is used for the lookup there is no need + * to disable preemption. + * + * It is still necessary to get a valid cpu, though, to probe for the + * default PMU instance as userspace is not required to specify a PMU + * type. In order to uphold the preexisting behavior KVM selects the + * PMU instance for the core during vcpu init. A dependent use + * case would be a user with disdain of all things big.LITTLE that + * affines the VMM to a particular cluster of cores. + * + * In any case, userspace should just do the sane thing and use the UAPI + * to select a PMU type directly. But, be wary of the baggage being + * carried here. */ - attr.type = PERF_TYPE_RAW; - attr.size = sizeof(attr); - attr.pinned = 1; - attr.disabled = 0; - attr.exclude_user = 0; - attr.exclude_kernel = 1; - attr.exclude_hv = 1; - attr.exclude_host = 1; - attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; - attr.sample_period = GENMASK(63, 0); - - event = perf_event_create_kernel_counter(&attr, -1, current, - kvm_pmu_perf_overflow, &attr); + cpu = raw_smp_processor_id(); + list_for_each_entry(entry, &arm_pmus, entry) { + pmu = entry->arm_pmu; - if (IS_ERR(event)) { - pr_err_once("kvm: pmu event creation failed %ld\n", - PTR_ERR(event)); - return NULL; + if (cpumask_test_cpu(cpu, &pmu->supported_cpus)) + return pmu; } - if (event->pmu) { - pmu = to_arm_pmu(event->pmu); - if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI || - pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) - pmu = NULL; - } + return NULL; +} + +static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1) +{ + u32 hi[2], lo[2]; + + bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); + bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); - perf_event_disable(event); - perf_event_release_kernel(event); + return ((u64)hi[pmceid1] << 32) | lo[pmceid1]; +} + +static u64 compute_pmceid0(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 0); - return pmu; + /* always support SW_INCR */ + val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR); + /* always support CHAIN */ + val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); + return val; +} + +static u64 compute_pmceid1(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 1); + + /* + * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled + * as RAZ + */ + val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); + return val; } u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) { + struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu; unsigned long *bmap = vcpu->kvm->arch.pmu_filter; u64 val, mask = 0; int base, i, nr_events; - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - if (!pmceid1) { - val = read_sysreg(pmceid0_el0); - /* always support CHAIN */ - val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); + val = compute_pmceid0(cpu_pmu); base = 0; } else { - val = read_sysreg(pmceid1_el0); - /* - * Don't advertise STALL_SLOT, as PMMIR_EL0 is handled - * as RAZ - */ - if (vcpu->kvm->arch.arm_pmu->pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4) - val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32); + val = compute_pmceid1(cpu_pmu); base = 32; } @@ -779,11 +910,19 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return val & mask; } -int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) +void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, mask); + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, mask); + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, mask); + + kvm_pmu_reprogram_counter_mask(vcpu, mask); +} + +int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) +{ if (!vcpu->arch.pmu.created) return -EINVAL; @@ -806,9 +945,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) return -EINVAL; } - /* One-off reload of the PMU on first run */ - kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); - return 0; } @@ -867,6 +1003,77 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) return true; } +/** + * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters. + * @kvm: The kvm pointer + */ +u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; + + /* + * PMUv3 requires that all event counters are capable of counting any + * event, though the same may not be true of non-PMUv3 hardware. + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return 1; + + /* + * The arm_pmu->cntr_mask considers the fixed counter(s) as well. + * Ignore those and return only the general-purpose counters. + */ + return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); +} + +static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr) +{ + kvm->arch.nr_pmu_counters = nr; + + /* Reset MDCR_EL2.HPMN behind the vcpus' back... */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2, kvm->arch.vcpu_features)) { + struct kvm_vcpu *vcpu; + unsigned long i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2); + val &= ~MDCR_EL2_HPMN; + val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters); + __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); + } + } +} + +static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) +{ + lockdep_assert_held(&kvm->arch.config_lock); + + kvm->arch.arm_pmu = arm_pmu; + kvm_arm_set_nr_counters(kvm, kvm_arm_pmu_get_max_counters(kvm)); +} + +/** + * kvm_arm_set_default_pmu - No PMU set, get the default one. + * @kvm: The kvm pointer + * + * The observant among you will notice that the supported_cpus + * mask does not get updated for the default PMU even though it + * is quite possible the selected instance supports only a + * subset of cores in the system. This is intentional, and + * upholds the preexisting behavior on heterogeneous systems + * where vCPUs can be scheduled on any core but the guest + * counters could stop working. + */ +int kvm_arm_set_default_pmu(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu(); + + if (!arm_pmu) + return -ENODEV; + + kvm_arm_set_pmu(kvm, arm_pmu); + return 0; +} + static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) { struct kvm *kvm = vcpu->kvm; @@ -874,19 +1081,19 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) struct arm_pmu *arm_pmu; int ret = -ENXIO; - mutex_lock(&kvm->lock); + lockdep_assert_held(&kvm->arch.config_lock); mutex_lock(&arm_pmus_lock); list_for_each_entry(entry, &arm_pmus, entry) { arm_pmu = entry->arm_pmu; if (arm_pmu->pmu.type == pmu_id) { - if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) || + if (kvm_vm_has_ran_once(kvm) || (kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) { ret = -EBUSY; break; } - kvm->arch.arm_pmu = arm_pmu; + kvm_arm_set_pmu(kvm, arm_pmu); cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus); ret = 0; break; @@ -894,31 +1101,35 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) } mutex_unlock(&arm_pmus_lock); - mutex_unlock(&kvm->lock); return ret; } +static int kvm_arm_pmu_v3_set_nr_counters(struct kvm_vcpu *vcpu, unsigned int n) +{ + struct kvm *kvm = vcpu->kvm; + + if (!kvm->arch.arm_pmu) + return -EINVAL; + + if (n > kvm_arm_pmu_get_max_counters(kvm)) + return -EINVAL; + + kvm_arm_set_nr_counters(kvm, n); + return 0; +} + int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { struct kvm *kvm = vcpu->kvm; + lockdep_assert_held(&kvm->arch.config_lock); + if (!kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (vcpu->arch.pmu.created) return -EBUSY; - mutex_lock(&kvm->lock); - if (!kvm->arch.arm_pmu) { - /* No PMU set, get the default one */ - kvm->arch.arm_pmu = kvm_pmu_probe_armpmu(); - if (!kvm->arch.arm_pmu) { - mutex_unlock(&kvm->lock); - return -ENODEV; - } - } - mutex_unlock(&kvm->lock); - switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; @@ -945,11 +1156,17 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return 0; } case KVM_ARM_VCPU_PMU_V3_FILTER: { + u8 pmuver = kvm_arm_pmu_get_pmuver_limit(); struct kvm_pmu_event_filter __user *uaddr; struct kvm_pmu_event_filter filter; int nr_events; - nr_events = kvm_pmu_event_mask(kvm) + 1; + /* + * Allow userspace to specify an event filter for the entire + * event range supported by PMUVer of the hardware, rather + * than the guest's PMUVer for KVM backward compatibility. + */ + nr_events = __kvm_pmu_event_mask(pmuver) + 1; uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; @@ -961,19 +1178,13 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) filter.action != KVM_PMU_EVENT_DENY)) return -EINVAL; - mutex_lock(&kvm->lock); - - if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags)) { - mutex_unlock(&kvm->lock); + if (kvm_vm_has_ran_once(kvm)) return -EBUSY; - } if (!kvm->arch.pmu_filter) { kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); - if (!kvm->arch.pmu_filter) { - mutex_unlock(&kvm->lock); + if (!kvm->arch.pmu_filter) return -ENOMEM; - } /* * The default depends on the first applied filter. @@ -992,8 +1203,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) else bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents); - mutex_unlock(&kvm->lock); - return 0; } case KVM_ARM_VCPU_PMU_V3_SET_PMU: { @@ -1005,6 +1214,15 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id); } + case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: { + unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr; + unsigned int n; + + if (get_user(n, uaddr)) + return -EFAULT; + + return kvm_arm_pmu_v3_set_nr_counters(vcpu, n); + } case KVM_ARM_VCPU_PMU_V3_INIT: return kvm_arm_pmu_v3_init(vcpu); } @@ -1043,6 +1261,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) case KVM_ARM_VCPU_PMU_V3_INIT: case KVM_ARM_VCPU_PMU_V3_FILTER: case KVM_ARM_VCPU_PMU_V3_SET_PMU: + case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: if (kvm_vcpu_has_pmu(vcpu)) return 0; } @@ -1052,11 +1271,65 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) u8 kvm_arm_pmu_get_pmuver_limit(void) { - u64 tmp; + unsigned int pmuver; + + pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, + read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); + + /* + * Spoof a barebones PMUv3 implementation if the system supports IMPDEF + * traps of the PMUv3 sysregs + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return ID_AA64DFR0_EL1_PMUVer_IMP; + + /* + * Otherwise, treat IMPLEMENTATION DEFINED functionality as + * unimplemented + */ + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return 0; + + return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5); +} + +/** + * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU + * @vcpu: The vcpu pointer + */ +u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) +{ + u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); + u64 n = vcpu->kvm->arch.nr_pmu_counters; + + if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu)) + n = FIELD_GET(MDCR_EL2_HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + + return u64_replace_bits(pmcr, n, ARMV8_PMU_PMCR_N); +} + +void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) +{ + bool reprogrammed = false; + unsigned long mask; + int i; + + mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + for_each_set_bit(i, &mask, 32) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + + /* + * We only need to reconfigure events where the filter is + * different at EL1 vs. EL2, as we're multiplexing the true EL1 + * event filter bit for nested. + */ + if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc)) + continue; + + kvm_pmu_create_perf_event(pmc); + reprogrammed = true; + } - tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - tmp = cpuid_feature_cap_perfmon_field(tmp, - ID_AA64DFR0_EL1_PMUVer_SHIFT, - ID_AA64DFR0_EL1_PMUVer_V3P5); - return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); + if (reprogrammed) + kvm_vcpu_pmu_restore_guest(vcpu); } diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 7887133d15f0..6b48a3d16d0d 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -5,6 +5,8 @@ */ #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <linux/perf/arm_pmu.h> +#include <linux/perf/arm_pmuv3.h> static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); @@ -35,11 +37,11 @@ struct kvm_pmu_events *kvm_get_pmu_events(void) * Add events to track that we may want to switch at guest entry/exit * time. */ -void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr)) + if (!system_supports_pmuv3() || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) @@ -51,90 +53,43 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) /* * Stop tracking events */ -void kvm_clr_pmu_events(u32 clr) +void kvm_clr_pmu_events(u64 clr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !pmu) + if (!system_supports_pmuv3()) return; pmu->events_host &= ~clr; pmu->events_guest &= ~clr; } -#define PMEVTYPER_READ_CASE(idx) \ - case idx: \ - return read_sysreg(pmevtyper##idx##_el0) - -#define PMEVTYPER_WRITE_CASE(idx) \ - case idx: \ - write_sysreg(val, pmevtyper##idx##_el0); \ - break - -#define PMEVTYPER_CASES(readwrite) \ - PMEVTYPER_##readwrite##_CASE(0); \ - PMEVTYPER_##readwrite##_CASE(1); \ - PMEVTYPER_##readwrite##_CASE(2); \ - PMEVTYPER_##readwrite##_CASE(3); \ - PMEVTYPER_##readwrite##_CASE(4); \ - PMEVTYPER_##readwrite##_CASE(5); \ - PMEVTYPER_##readwrite##_CASE(6); \ - PMEVTYPER_##readwrite##_CASE(7); \ - PMEVTYPER_##readwrite##_CASE(8); \ - PMEVTYPER_##readwrite##_CASE(9); \ - PMEVTYPER_##readwrite##_CASE(10); \ - PMEVTYPER_##readwrite##_CASE(11); \ - PMEVTYPER_##readwrite##_CASE(12); \ - PMEVTYPER_##readwrite##_CASE(13); \ - PMEVTYPER_##readwrite##_CASE(14); \ - PMEVTYPER_##readwrite##_CASE(15); \ - PMEVTYPER_##readwrite##_CASE(16); \ - PMEVTYPER_##readwrite##_CASE(17); \ - PMEVTYPER_##readwrite##_CASE(18); \ - PMEVTYPER_##readwrite##_CASE(19); \ - PMEVTYPER_##readwrite##_CASE(20); \ - PMEVTYPER_##readwrite##_CASE(21); \ - PMEVTYPER_##readwrite##_CASE(22); \ - PMEVTYPER_##readwrite##_CASE(23); \ - PMEVTYPER_##readwrite##_CASE(24); \ - PMEVTYPER_##readwrite##_CASE(25); \ - PMEVTYPER_##readwrite##_CASE(26); \ - PMEVTYPER_##readwrite##_CASE(27); \ - PMEVTYPER_##readwrite##_CASE(28); \ - PMEVTYPER_##readwrite##_CASE(29); \ - PMEVTYPER_##readwrite##_CASE(30) - /* * Read a value direct from PMEVTYPER<idx> where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. */ static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) { - switch (idx) { - PMEVTYPER_CASES(READ); - case ARMV8_PMU_CYCLE_IDX: - return read_sysreg(pmccfiltr_el0); - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + return read_pmccfiltr(); + else if (idx == ARMV8_PMU_INSTR_IDX) + return read_pmicfiltr(); - return 0; + return read_pmevtypern(idx); } /* * Write a value direct to PMEVTYPER<idx> where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. */ static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val) { - switch (idx) { - PMEVTYPER_CASES(WRITE); - case ARMV8_PMU_CYCLE_IDX: - write_sysreg(val, pmccfiltr_el0); - break; - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + write_pmccfiltr(val); + else if (idx == ARMV8_PMU_INSTR_IDX) + write_pmicfiltr(val); + else + write_pmevtypern(idx, val); } /* @@ -145,7 +100,7 @@ static void kvm_vcpu_pmu_enable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer &= ~ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); @@ -160,7 +115,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer |= ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); @@ -176,9 +131,9 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; - if (!kvm_arm_support_pmu_v3() || !has_vhe()) + if (!system_supports_pmuv3() || !has_vhe()) return; preempt_disable(); @@ -197,9 +152,9 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; - if (!kvm_arm_support_pmu_v3() || !has_vhe()) + if (!system_supports_pmuv3() || !has_vhe()) return; pmu = kvm_get_pmu_events(); @@ -209,3 +164,48 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) kvm_vcpu_pmu_enable_el0(events_host); kvm_vcpu_pmu_disable_el0(events_guest); } + +/* + * With VHE, keep track of the PMUSERENR_EL0 value for the host EL0 on the pCPU + * where PMUSERENR_EL0 for the guest is loaded, since PMUSERENR_EL0 is switched + * to the value for the guest on vcpu_load(). The value for the host EL0 + * will be restored on vcpu_put(), before returning to userspace. + * This isn't necessary for nVHE, as the register is context switched for + * every guest enter/exit. + * + * Return true if KVM takes care of the register. Otherwise return false. + */ +bool kvm_set_pmuserenr(u64 val) +{ + struct kvm_cpu_context *hctxt; + struct kvm_vcpu *vcpu; + + if (!system_supports_pmuv3() || !has_vhe()) + return false; + + vcpu = kvm_get_running_vcpu(); + if (!vcpu || !vcpu_get_flag(vcpu, PMUSERENR_ON_CPU)) + return false; + + hctxt = host_data_ptr(host_ctxt); + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val; + return true; +} + +/* + * If we interrupted the guest to update the host PMU context, make + * sure we re-apply the guest EL0 state. + */ +void kvm_vcpu_pmu_resync_el0(void) +{ + struct kvm_vcpu *vcpu; + + if (!has_vhe() || !in_interrupt()) + return; + + vcpu = kvm_get_running_vcpu(); + if (!vcpu) + return; + + kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu); +} diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 7fbc4c1b9df0..3b5dbe9a0a0e 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -62,6 +62,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) struct vcpu_reset_state *reset_state; struct kvm *kvm = source_vcpu->kvm; struct kvm_vcpu *vcpu = NULL; + int ret = PSCI_RET_SUCCESS; unsigned long cpu_id; cpu_id = smccc_get_arg1(source_vcpu); @@ -76,11 +77,15 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ if (!vcpu) return PSCI_RET_INVALID_PARAMS; + + spin_lock(&vcpu->arch.mp_state_lock); if (!kvm_arm_vcpu_stopped(vcpu)) { if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) - return PSCI_RET_ALREADY_ON; + ret = PSCI_RET_ALREADY_ON; else - return PSCI_RET_INVALID_PARAMS; + ret = PSCI_RET_INVALID_PARAMS; + + goto out_unlock; } reset_state = &vcpu->arch.reset_state; @@ -96,7 +101,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ reset_state->r0 = smccc_get_arg3(source_vcpu); - WRITE_ONCE(reset_state->reset, true); + reset_state->reset = true; kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); /* @@ -105,10 +110,12 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ smp_wmb(); - vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); kvm_vcpu_wake_up(vcpu); - return PSCI_RET_SUCCESS; +out_unlock: + spin_unlock(&vcpu->arch.mp_state_lock); + return ret; } static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) @@ -168,8 +175,11 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags) * after this call is handled and before the VCPUs have been * re-initialized. */ - kvm_for_each_vcpu(i, tmp, vcpu->kvm) - tmp->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { + spin_lock(&tmp->arch.mp_state_lock); + WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); + spin_unlock(&tmp->arch.mp_state_lock); + } kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); @@ -184,6 +194,12 @@ static void kvm_psci_system_off(struct kvm_vcpu *vcpu) kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, 0); } +static void kvm_psci_system_off2(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, + KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2); +} + static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) { kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, 0); @@ -229,7 +245,6 @@ static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; u32 psci_fn = smccc_get_function(vcpu); unsigned long val; int ret = 1; @@ -254,9 +269,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) kvm_psci_narrow_to_32bit(vcpu); fallthrough; case PSCI_0_2_FN64_CPU_ON: - mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); - mutex_unlock(&kvm->lock); break; case PSCI_0_2_FN_AFFINITY_INFO: kvm_psci_narrow_to_32bit(vcpu); @@ -315,7 +328,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) switch(psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: - val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1; + val = PSCI_VERSION(1, minor); break; case PSCI_1_0_FN_PSCI_FEATURES: arg = smccc_get_arg1(vcpu); @@ -351,6 +364,11 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) if (minor >= 1) val = 0; break; + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor >= 3) + val = PSCI_1_3_OFF_TYPE_HIBERNATE_OFF; + break; } break; case PSCI_1_0_FN_SYSTEM_SUSPEND: @@ -385,6 +403,33 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) break; } break; + case PSCI_1_3_FN_SYSTEM_OFF2: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor < 3) + break; + + arg = smccc_get_arg1(vcpu); + /* + * SYSTEM_OFF2 defaults to HIBERNATE_OFF if arg1 is zero. arg2 + * must be zero. + */ + if ((arg && arg != PSCI_1_3_OFF_TYPE_HIBERNATE_OFF) || + smccc_get_arg2(vcpu) != 0) { + val = PSCI_RET_INVALID_PARAMS; + break; + } + kvm_psci_system_off2(vcpu); + /* + * We shouldn't be going back to the guest after receiving a + * SYSTEM_OFF2 request. Preload a return value of + * INTERNAL_FAILURE should userspace ignore the exit and resume + * the vCPU. + */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; default: return kvm_psci_0_2_call(vcpu); } @@ -395,7 +440,6 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; u32 psci_fn = smccc_get_function(vcpu); unsigned long val; @@ -405,9 +449,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) val = PSCI_RET_SUCCESS; break; case KVM_PSCI_FN_CPU_ON: - mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); - mutex_unlock(&kvm->lock); break; default: val = PSCI_RET_NOT_SUPPORTED; @@ -435,6 +477,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) int kvm_psci_call(struct kvm_vcpu *vcpu) { u32 psci_fn = smccc_get_function(vcpu); + int version = kvm_psci_version(vcpu); unsigned long val; val = kvm_psci_check_allowed_function(vcpu, psci_fn); @@ -443,7 +486,11 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) return 1; } - switch (kvm_psci_version(vcpu)) { + switch (version) { + case KVM_ARM_PSCI_1_3: + return kvm_psci_1_x_call(vcpu, 3); + case KVM_ARM_PSCI_1_2: + return kvm_psci_1_x_call(vcpu, 2); case KVM_ARM_PSCI_1_1: return kvm_psci_1_x_call(vcpu, 1); case KVM_ARM_PSCI_1_0: @@ -453,6 +500,8 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) case KVM_ARM_PSCI_0_1: return kvm_psci_0_1_call(vcpu); default: - return -EINVAL; + WARN_ONCE(1, "Unknown PSCI version %d", version); + smccc_set_retval(vcpu, SMCCC_RET_NOT_SUPPORTED, 0, 0, 0); + return 1; } } diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c new file mode 100644 index 000000000000..6cbe018fd6fd --- /dev/null +++ b/arch/arm64/kvm/ptdump.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Debug helper used to dump the stage-2 pagetables of the system and their + * associated permissions. + * + * Copyright (C) Google, 2024 + * Author: Sebastian Ene <sebastianene@google.com> + */ +#include <linux/debugfs.h> +#include <linux/kvm_host.h> +#include <linux/seq_file.h> + +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/ptdump.h> + +#define MARKERS_LEN 2 +#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1) + +struct kvm_ptdump_guest_state { + struct kvm *kvm; + struct ptdump_pg_state parser_state; + struct addr_marker ipa_marker[MARKERS_LEN]; + struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS]; + struct ptdump_range range[MARKERS_LEN]; +}; + +static const struct ptdump_prot_bits stage2_pte_bits[] = { + { + .mask = PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "F", + }, + { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, + .set = "R", + .clear = " ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, + .set = "W", + .clear = " ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b00UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px ux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b01UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b10UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNUXN", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b11UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px UXN", + }, + { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF, + .val = KVM_PTE_LEAF_ATTR_LO_S2_AF, + .set = "AF", + .clear = " ", + }, + { + .mask = PMD_TYPE_MASK, + .val = PMD_TYPE_SECT, + .set = "BLK", + .clear = " ", + }, +}; + +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct ptdump_pg_state *st = ctx->arg; + struct ptdump_state *pt_st = &st->ptdump; + + note_page(pt_st, ctx->addr, ctx->level, ctx->old); + + return 0; +} + +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl) +{ + u32 i; + u64 mask; + + if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + mask = 0; + for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++) + mask |= stage2_pte_bits[i].mask; + + for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) { + snprintf(level[i].name, sizeof(level[i].name), "%u", i); + + level[i].num = ARRAY_SIZE(stage2_pte_bits); + level[i].bits = stage2_pte_bits; + level[i].mask = mask; + } + + return 0; +} + +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm) +{ + struct kvm_ptdump_guest_state *st; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct kvm_pgtable *pgtable = mmu->pgt; + int ret; + + st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT); + if (!st) + return ERR_PTR(-ENOMEM); + + ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level); + if (ret) { + kfree(st); + return ERR_PTR(ret); + } + + st->ipa_marker[0].name = "Guest IPA"; + st->ipa_marker[1].start_address = BIT(pgtable->ia_bits); + st->range[0].end = BIT(pgtable->ia_bits); + + st->kvm = kvm; + st->parser_state = (struct ptdump_pg_state) { + .marker = &st->ipa_marker[0], + .level = -1, + .pg_level = &st->level[0], + .ptdump.range = &st->range[0], + .start_address = 0, + }; + + return st; +} + +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused) +{ + int ret; + struct kvm_ptdump_guest_state *st = m->private; + struct kvm *kvm = st->kvm; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct ptdump_pg_state *parser_state = &st->parser_state; + struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) { + .cb = kvm_ptdump_visitor, + .arg = parser_state, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + parser_state->seq = m; + + write_lock(&kvm->mmu_lock); + ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker); + write_unlock(&kvm->mmu_lock); + + return ret; +} + +static int kvm_ptdump_guest_open(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + struct kvm_ptdump_guest_state *st; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + st = kvm_ptdump_parser_create(kvm); + if (IS_ERR(st)) { + ret = PTR_ERR(st); + goto err_with_kvm_ref; + } + + ret = single_open(file, kvm_ptdump_guest_show, st); + if (!ret) + return 0; + + kfree(st); +err_with_kvm_ref: + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_ptdump_guest_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + void *st = ((struct seq_file *)file->private_data)->private; + + kfree(st); + kvm_put_kvm(kvm); + + return single_release(m, file); +} + +static const struct file_operations kvm_ptdump_guest_fops = { + .open = kvm_ptdump_guest_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_ptdump_guest_close, +}; + +static int kvm_pgtable_range_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%2u\n", pgtable->ia_bits); + return 0; +} + +static int kvm_pgtable_levels_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level); + return 0; +} + +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + struct kvm *kvm = m->i_private; + struct kvm_pgtable *pgtable; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + pgtable = kvm->arch.mmu.pgt; + + ret = single_open(file, show, pgtable); + if (ret < 0) + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_pgtable_range_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show); +} + +static int kvm_pgtable_levels_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show); +} + +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + + kvm_put_kvm(kvm); + return single_release(m, file); +} + +static const struct file_operations kvm_pgtable_range_fops = { + .open = kvm_pgtable_range_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +static const struct file_operations kvm_pgtable_levels_fops = { + .open = kvm_pgtable_levels_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) +{ + debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry, + kvm, &kvm_ptdump_guest_fops); + debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm, + &kvm_pgtable_range_fops); + debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry, + kvm, &kvm_pgtable_levels_fops); +} diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 78a09f7a6637..4ceabaa4c30b 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -19,7 +19,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) u64 steal = 0; int idx; - if (base == GPA_INVALID) + if (base == INVALID_GPA) return; idx = srcu_read_lock(&kvm->srcu); @@ -40,7 +40,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) switch (feature) { case ARM_SMCCC_HV_PV_TIME_FEATURES: case ARM_SMCCC_HV_PV_TIME_ST: - if (vcpu->arch.steal.base != GPA_INVALID) + if (vcpu->arch.steal.base != INVALID_GPA) val = SMCCC_RET_SUCCESS; break; } @@ -54,7 +54,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; u64 base = vcpu->arch.steal.base; - if (base == GPA_INVALID) + if (base == INVALID_GPA) return base; /* @@ -89,7 +89,7 @@ int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, return -EFAULT; if (!IS_ALIGNED(ipa, 64)) return -EINVAL; - if (vcpu->arch.steal.base != GPA_INVALID) + if (vcpu->arch.steal.base != INVALID_GPA) return -EEXIST; /* Check the address is in a valid memslot */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e0267f672b8a..959532422d3a 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -27,10 +27,12 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/virt.h> /* Maximum phys_shift supported for any VM on this host */ -static u32 kvm_ipa_limit; +static u32 __ro_after_init kvm_ipa_limit; +unsigned int __ro_after_init kvm_host_sve_max_vl; /* * ARMv8 Reset Values @@ -38,15 +40,20 @@ static u32 kvm_ipa_limit; #define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ PSR_F_BIT | PSR_D_BIT) +#define VCPU_RESET_PSTATE_EL2 (PSR_MODE_EL2h | PSR_A_BIT | PSR_I_BIT | \ + PSR_F_BIT | PSR_D_BIT) + #define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ PSR_AA32_I_BIT | PSR_AA32_F_BIT) -unsigned int kvm_sve_max_vl; +unsigned int __ro_after_init kvm_sve_max_vl; -int kvm_arm_init_sve(void) +int __init kvm_arm_init_sve(void) { if (system_supports_sve()) { kvm_sve_max_vl = sve_max_virtualisable_vl(); + kvm_host_sve_max_vl = sve_max_vl(); + kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl; /* * The get_sve_reg()/set_sve_reg() ioctl interface will need @@ -69,11 +76,8 @@ int kvm_arm_init_sve(void) return 0; } -static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) +static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) { - if (!system_supports_sve()) - return -EINVAL; - vcpu->arch.sve_max_vl = kvm_sve_max_vl; /* @@ -81,9 +85,7 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until * kvm_arm_vcpu_finalize(), which freezes the configuration. */ - vcpu_set_flag(vcpu, GUEST_HAS_SVE); - - return 0; + set_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &vcpu->kvm->arch.flags); } /* @@ -152,11 +154,13 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { void *sve_state = vcpu->arch.sve_state; - kvm_vcpu_unshare_task_fp(vcpu); kvm_unshare_hyp(vcpu, vcpu + 1); if (sve_state) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); kfree(sve_state); + free_page((unsigned long)vcpu->arch.ctxt.vncr_array); + kfree(vcpu->arch.vncr_tlb); + kfree(vcpu->arch.ccsidr); } static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) @@ -165,69 +169,6 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } -static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) -{ - /* - * For now make sure that both address/generic pointer authentication - * features are requested by the userspace together and the system - * supports these capabilities. - */ - if (!test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || - !test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features) || - !system_has_full_ptr_auth()) - return -EINVAL; - - vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); - return 0; -} - -/** - * kvm_set_vm_width() - set the register width for the guest - * @vcpu: Pointer to the vcpu being configured - * - * Set both KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED - * in the VM flags based on the vcpu's requested register width, the HW - * capabilities and other options (such as MTE). - * When REG_WIDTH_CONFIGURED is already set, the vcpu settings must be - * consistent with the value of the FLAG_EL1_32BIT bit in the flags. - * - * Return: 0 on success, negative error code on failure. - */ -static int kvm_set_vm_width(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - bool is32bit; - - is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); - - lockdep_assert_held(&kvm->lock); - - if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) { - /* - * The guest's register width is already configured. - * Make sure that the vcpu is consistent with it. - */ - if (is32bit == test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags)) - return 0; - - return -EINVAL; - } - - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit) - return -EINVAL; - - /* MTE is incompatible with AArch32 */ - if (kvm_has_mte(kvm) && is32bit) - return -EINVAL; - - if (is32bit) - set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); - - set_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags); - - return 0; -} - /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer @@ -246,26 +187,16 @@ static int kvm_set_vm_width(struct kvm_vcpu *vcpu) * disable preemption around the vcpu reset as we would otherwise race with * preempt notifiers which also call put/load. */ -int kvm_reset_vcpu(struct kvm_vcpu *vcpu) +void kvm_reset_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_reset_state reset_state; - int ret; bool loaded; u32 pstate; - mutex_lock(&vcpu->kvm->lock); - ret = kvm_set_vm_width(vcpu); - if (!ret) { - reset_state = vcpu->arch.reset_state; - WRITE_ONCE(vcpu->arch.reset_state.reset, false); - } - mutex_unlock(&vcpu->kvm->lock); - - if (ret) - return ret; - - /* Reset PMU outside of the non-preemptible section */ - kvm_pmu_vcpu_reset(vcpu); + spin_lock(&vcpu->arch.mp_state_lock); + reset_state = vcpu->arch.reset_state; + vcpu->arch.reset_state.reset = false; + spin_unlock(&vcpu->arch.mp_state_lock); preempt_disable(); loaded = (vcpu->cpu != -1); @@ -273,37 +204,18 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_arch_vcpu_put(vcpu); if (!kvm_arm_vcpu_sve_finalized(vcpu)) { - if (test_bit(KVM_ARM_VCPU_SVE, vcpu->arch.features)) { - ret = kvm_vcpu_enable_sve(vcpu); - if (ret) - goto out; - } + if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) + kvm_vcpu_enable_sve(vcpu); } else { kvm_vcpu_reset_sve(vcpu); } - if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || - test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) { - if (kvm_vcpu_enable_ptrauth(vcpu)) { - ret = -EINVAL; - goto out; - } - } - - switch (vcpu->arch.target) { - default: - if (vcpu_el1_is_32bit(vcpu)) { - pstate = VCPU_RESET_PSTATE_SVC; - } else { - pstate = VCPU_RESET_PSTATE_EL1; - } - - if (kvm_vcpu_has_pmu(vcpu) && !kvm_arm_support_pmu_v3()) { - ret = -EINVAL; - goto out; - } - break; - } + if (vcpu_el1_is_32bit(vcpu)) + pstate = VCPU_RESET_PSTATE_SVC; + else if (vcpu_has_nv(vcpu)) + pstate = VCPU_RESET_PSTATE_EL2; + else + pstate = VCPU_RESET_PSTATE_EL1; /* Reset core registers */ memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); @@ -339,12 +251,17 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } /* Reset timer */ - ret = kvm_timer_vcpu_reset(vcpu); -out: + kvm_timer_vcpu_reset(vcpu); + if (loaded) kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); - return ret; +} + +u32 kvm_get_pa_bits(struct kvm *kvm) +{ + /* Fixed limit until we can configure ID_AA64MMFR0.PARange */ + return kvm_ipa_limit; } u32 get_kvm_ipa_limit(void) @@ -352,7 +269,7 @@ u32 get_kvm_ipa_limit(void) return kvm_ipa_limit; } -int kvm_set_ipa_limit(void) +int __init kvm_set_ipa_limit(void) { unsigned int parange; u64 mmfr0; @@ -361,12 +278,11 @@ int kvm_set_ipa_limit(void) parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); /* - * IPA size beyond 48 bits could not be supported - * on either 4K or 16K page size. Hence let's cap - * it to 48 bits, in case it's reported as larger - * on the system. + * IPA size beyond 48 bits for 4K and 16K page size is only supported + * when LPA2 is available. So if we have LPA2, enable it, else cap to 48 + * bits, in case it's reported as larger on the system. */ - if (PAGE_SIZE != SZ_64K) + if (!kvm_lpa2_is_enabled() && PAGE_SIZE != SZ_64K) parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48); /* diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c index 3ace5b75813b..af5eec681127 100644 --- a/arch/arm64/kvm/stacktrace.c +++ b/arch/arm64/kvm/stacktrace.c @@ -19,6 +19,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> #include <asm/stacktrace/nvhe.h> static struct stack_info stackinfo_get_overflow(void) @@ -50,7 +51,7 @@ static struct stack_info stackinfo_get_hyp(void) struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); unsigned long low = (unsigned long)stacktrace_info->stack_base; - unsigned long high = low + PAGE_SIZE; + unsigned long high = low + NVHE_STACK_SIZE; return (struct stack_info) { .low = low, @@ -60,8 +61,8 @@ static struct stack_info stackinfo_get_hyp(void) static struct stack_info stackinfo_get_hyp_kern_va(void) { - unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); - unsigned long high = low + PAGE_SIZE; + unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_base); + unsigned long high = low + NVHE_STACK_SIZE; return (struct stack_info) { .low = low, @@ -145,7 +146,7 @@ static void unwind(struct unwind_state *state, */ static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) { - unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); + unsigned long va_mask = GENMASK_ULL(__hyp_va_bits - 1, 0); unsigned long hyp_offset = (unsigned long)arg; /* Mask tags and convert to kern addr */ diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c6cbfe6b854b..c8fd7c6a12a1 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -11,11 +11,15 @@ #include <linux/bitfield.h> #include <linux/bsearch.h> +#include <linux/cacheinfo.h> +#include <linux/debugfs.h> #include <linux/kvm_host.h> #include <linux/mm.h> #include <linux/printk.h> #include <linux/uaccess.h> +#include <linux/irqchip/arm-gic-v3.h> +#include <asm/arm_pmuv3.h> #include <asm/cacheflush.h> #include <asm/cputype.h> #include <asm/debug-monitors.h> @@ -24,12 +28,14 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/perf_event.h> #include <asm/sysreg.h> #include <trace/events/kvm.h> #include "sys_regs.h" +#include "vgic/vgic.h" #include "trace.h" @@ -40,66 +46,454 @@ */ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); +static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val); + +static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return false; +} + +static bool bad_trap(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r, + const char *msg) +{ + WARN_ONCE(1, "Unexpected %s\n", msg); + print_sys_reg_instr(params); + return undef_access(vcpu, params, r); +} static bool read_from_write_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { - WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n"); - print_sys_reg_instr(params); - kvm_inject_undefined(vcpu); - return false; + return bad_trap(vcpu, params, r, + "sys_reg read to write-only register"); } static bool write_to_read_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { - WARN_ONCE(1, "Unexpected sys_reg write to read-only register\n"); - print_sys_reg_instr(params); - kvm_inject_undefined(vcpu); - return false; + return bad_trap(vcpu, params, r, + "sys_reg write to read-only register"); } -u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) +enum sr_loc_attr { + SR_LOC_MEMORY = 0, /* Register definitely in memory */ + SR_LOC_LOADED = BIT(0), /* Register on CPU, unless it cannot */ + SR_LOC_MAPPED = BIT(1), /* Register in a different CPU register */ + SR_LOC_XLATED = BIT(2), /* Register translated to fit another reg */ + SR_LOC_SPECIAL = BIT(3), /* Demanding register, implies loaded */ +}; + +struct sr_loc { + enum sr_loc_attr loc; + enum vcpu_sysreg map_reg; + u64 (*xlate)(u64); +}; + +static enum sr_loc_attr locate_direct_register(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) { - u64 val = 0x8badf00d8badf00d; + switch (reg) { + case SCTLR_EL1: + case CPACR_EL1: + case TTBR0_EL1: + case TTBR1_EL1: + case TCR_EL1: + case TCR2_EL1: + case PIR_EL1: + case PIRE0_EL1: + case POR_EL1: + case ESR_EL1: + case AFSR0_EL1: + case AFSR1_EL1: + case FAR_EL1: + case MAIR_EL1: + case VBAR_EL1: + case CONTEXTIDR_EL1: + case AMAIR_EL1: + case CNTKCTL_EL1: + case ELR_EL1: + case SPSR_EL1: + case ZCR_EL1: + case SCTLR2_EL1: + /* + * EL1 registers which have an ELx2 mapping are loaded if + * we're not in hypervisor context. + */ + return is_hyp_ctxt(vcpu) ? SR_LOC_MEMORY : SR_LOC_LOADED; + + case TPIDR_EL0: + case TPIDRRO_EL0: + case TPIDR_EL1: + case PAR_EL1: + case DACR32_EL2: + case IFSR32_EL2: + case DBGVCR32_EL2: + /* These registers are always loaded, no matter what */ + return SR_LOC_LOADED; - if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && - __vcpu_read_sys_reg_from_cpu(reg, &val)) - return val; + default: + /* Non-mapped EL2 registers are by definition in memory. */ + return SR_LOC_MEMORY; + } +} - return __vcpu_sys_reg(vcpu, reg); +static void locate_mapped_el2_register(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg, + enum vcpu_sysreg map_reg, + u64 (*xlate)(u64), + struct sr_loc *loc) +{ + if (!is_hyp_ctxt(vcpu)) { + loc->loc = SR_LOC_MEMORY; + return; + } + + loc->loc = SR_LOC_LOADED | SR_LOC_MAPPED; + loc->map_reg = map_reg; + + WARN_ON(locate_direct_register(vcpu, map_reg) != SR_LOC_MEMORY); + + if (xlate != NULL && !vcpu_el2_e2h_is_set(vcpu)) { + loc->loc |= SR_LOC_XLATED; + loc->xlate = xlate; + } } -void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) +#define MAPPED_EL2_SYSREG(r, m, t) \ + case r: { \ + locate_mapped_el2_register(vcpu, r, m, t, loc); \ + break; \ + } + +static void locate_register(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, + struct sr_loc *loc) { - if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && - __vcpu_write_sys_reg_to_cpu(val, reg)) + if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) { + loc->loc = SR_LOC_MEMORY; return; + } + + switch (reg) { + MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1, + translate_sctlr_el2_to_sctlr_el1 ); + MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1, + translate_cptr_el2_to_cpacr_el1 ); + MAPPED_EL2_SYSREG(TTBR0_EL2, TTBR0_EL1, + translate_ttbr0_el2_to_ttbr0_el1 ); + MAPPED_EL2_SYSREG(TTBR1_EL2, TTBR1_EL1, NULL ); + MAPPED_EL2_SYSREG(TCR_EL2, TCR_EL1, + translate_tcr_el2_to_tcr_el1 ); + MAPPED_EL2_SYSREG(VBAR_EL2, VBAR_EL1, NULL ); + MAPPED_EL2_SYSREG(AFSR0_EL2, AFSR0_EL1, NULL ); + MAPPED_EL2_SYSREG(AFSR1_EL2, AFSR1_EL1, NULL ); + MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL ); + MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); + MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); + MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL ); + MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL ); + MAPPED_EL2_SYSREG(POR_EL2, POR_EL1, NULL ); + MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); + MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); + MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); + MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL ); + case CNTHCTL_EL2: + /* CNTHCTL_EL2 is super special, until we support NV2.1 */ + loc->loc = ((is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ? + SR_LOC_SPECIAL : SR_LOC_MEMORY); + break; + default: + loc->loc = locate_direct_register(vcpu, reg); + } +} + +static u64 read_sr_from_cpu(enum vcpu_sysreg reg) +{ + u64 val = 0x8badf00d8badf00d; + + switch (reg) { + case SCTLR_EL1: val = read_sysreg_s(SYS_SCTLR_EL12); break; + case CPACR_EL1: val = read_sysreg_s(SYS_CPACR_EL12); break; + case TTBR0_EL1: val = read_sysreg_s(SYS_TTBR0_EL12); break; + case TTBR1_EL1: val = read_sysreg_s(SYS_TTBR1_EL12); break; + case TCR_EL1: val = read_sysreg_s(SYS_TCR_EL12); break; + case TCR2_EL1: val = read_sysreg_s(SYS_TCR2_EL12); break; + case PIR_EL1: val = read_sysreg_s(SYS_PIR_EL12); break; + case PIRE0_EL1: val = read_sysreg_s(SYS_PIRE0_EL12); break; + case POR_EL1: val = read_sysreg_s(SYS_POR_EL12); break; + case ESR_EL1: val = read_sysreg_s(SYS_ESR_EL12); break; + case AFSR0_EL1: val = read_sysreg_s(SYS_AFSR0_EL12); break; + case AFSR1_EL1: val = read_sysreg_s(SYS_AFSR1_EL12); break; + case FAR_EL1: val = read_sysreg_s(SYS_FAR_EL12); break; + case MAIR_EL1: val = read_sysreg_s(SYS_MAIR_EL12); break; + case VBAR_EL1: val = read_sysreg_s(SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break; + case AMAIR_EL1: val = read_sysreg_s(SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: val = read_sysreg_s(SYS_CNTKCTL_EL12); break; + case ELR_EL1: val = read_sysreg_s(SYS_ELR_EL12); break; + case SPSR_EL1: val = read_sysreg_s(SYS_SPSR_EL12); break; + case ZCR_EL1: val = read_sysreg_s(SYS_ZCR_EL12); break; + case SCTLR2_EL1: val = read_sysreg_s(SYS_SCTLR2_EL12); break; + case TPIDR_EL0: val = read_sysreg_s(SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: val = read_sysreg_s(SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: val = read_sysreg_s(SYS_TPIDR_EL1); break; + case PAR_EL1: val = read_sysreg_par(); break; + case DACR32_EL2: val = read_sysreg_s(SYS_DACR32_EL2); break; + case IFSR32_EL2: val = read_sysreg_s(SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: val = read_sysreg_s(SYS_DBGVCR32_EL2); break; + default: WARN_ON_ONCE(1); + } + + return val; +} - __vcpu_sys_reg(vcpu, reg) = val; +static void write_sr_to_cpu(enum vcpu_sysreg reg, u64 val) +{ + switch (reg) { + case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; + case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; + case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; + case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; + case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; + case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break; + case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break; + case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break; + case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break; + case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; + case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; + case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; + case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break; + case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break; + case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break; + case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; + case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break; + case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break; + case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break; + case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break; + case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break; + case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; + case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; + case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; + default: WARN_ON_ONCE(1); + } } -/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ -static u32 cache_levels; +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg) +{ + struct sr_loc loc = {}; + + locate_register(vcpu, reg, &loc); + + WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY); + + if (loc.loc & SR_LOC_SPECIAL) { + u64 val; + + WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL); + + /* + * CNTHCTL_EL2 requires some special treatment to account + * for the bits that can be set via CNTKCTL_EL1 when E2H==1. + */ + switch (reg) { + case CNTHCTL_EL2: + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; + return val; + default: + WARN_ON_ONCE(1); + } + } + + if (loc.loc & SR_LOC_LOADED) { + enum vcpu_sysreg map_reg = reg; + + if (loc.loc & SR_LOC_MAPPED) + map_reg = loc.map_reg; + + if (!(loc.loc & SR_LOC_XLATED)) { + u64 val = read_sr_from_cpu(map_reg); + + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); + + return val; + } + } + + return __vcpu_sys_reg(vcpu, reg); +} + +void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, enum vcpu_sysreg reg) +{ + struct sr_loc loc = {}; + + locate_register(vcpu, reg, &loc); + + WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY); + + if (loc.loc & SR_LOC_SPECIAL) { + + WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL); + + switch (reg) { + case CNTHCTL_EL2: + /* + * If E2H=1, some of the bits are backed by + * CNTKCTL_EL1, while the rest is kept in memory. + * Yes, this is fun stuff. + */ + write_sysreg_el1(val, SYS_CNTKCTL); + break; + default: + WARN_ON_ONCE(1); + } + } + + if (loc.loc & SR_LOC_LOADED) { + enum vcpu_sysreg map_reg = reg; + u64 xlated_val; + + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); + + if (loc.loc & SR_LOC_MAPPED) + map_reg = loc.map_reg; + + if (loc.loc & SR_LOC_XLATED) + xlated_val = loc.xlate(val); + else + xlated_val = val; + + write_sr_to_cpu(map_reg, xlated_val); + + /* + * Fall through to write the backing store anyway, which + * allows translated registers to be directly read without a + * reverse translation. + */ + } + + __vcpu_assign_sys_reg(vcpu, reg, val); +} /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ #define CSSELR_MAX 14 +/* + * Returns the minimum line size for the selected cache, expressed as + * Log2(bytes). + */ +static u8 get_min_cache_line_size(bool icache) +{ + u64 ctr = read_sanitised_ftr_reg(SYS_CTR_EL0); + u8 field; + + if (icache) + field = SYS_FIELD_GET(CTR_EL0, IminLine, ctr); + else + field = SYS_FIELD_GET(CTR_EL0, DminLine, ctr); + + /* + * Cache line size is represented as Log2(words) in CTR_EL0. + * Log2(bytes) can be derived with the following: + * + * Log2(words) + 2 = Log2(bytes / 4) + 2 + * = Log2(bytes) - 2 + 2 + * = Log2(bytes) + */ + return field + 2; +} + /* Which cache CCSIDR represents depends on CSSELR value. */ -static u32 get_ccsidr(u32 csselr) +static u32 get_ccsidr(struct kvm_vcpu *vcpu, u32 csselr) { - u32 ccsidr; + u8 line_size; + + if (vcpu->arch.ccsidr) + return vcpu->arch.ccsidr[csselr]; + + line_size = get_min_cache_line_size(csselr & CSSELR_EL1_InD); + + /* + * Fabricate a CCSIDR value as the overriding value does not exist. + * The real CCSIDR value will not be used as it can vary by the + * physical CPU which the vcpu currently resides in. + * + * The line size is determined with get_min_cache_line_size(), which + * should be valid for all CPUs even if they have different cache + * configuration. + * + * The associativity bits are cleared, meaning the geometry of all data + * and unified caches (which are guaranteed to be PIPT and thus + * non-aliasing) are 1 set and 1 way. + * Guests should not be doing cache operations by set/way at all, and + * for this reason, we trap them and attempt to infer the intent, so + * that we can flush the entire guest's address space at the appropriate + * time. The exposed geometry minimizes the number of the traps. + * [If guests should attempt to infer aliasing properties from the + * geometry (which is not permitted by the architecture), they would + * only do so for virtually indexed caches.] + * + * We don't check if the cache level exists as it is allowed to return + * an UNKNOWN value if not. + */ + return SYS_FIELD_PREP(CCSIDR_EL1, LineSize, line_size - 4); +} + +static int set_ccsidr(struct kvm_vcpu *vcpu, u32 csselr, u32 val) +{ + u8 line_size = FIELD_GET(CCSIDR_EL1_LineSize, val) + 4; + u32 *ccsidr = vcpu->arch.ccsidr; + u32 i; + + if ((val & CCSIDR_EL1_RES0) || + line_size < get_min_cache_line_size(csselr & CSSELR_EL1_InD)) + return -EINVAL; + + if (!ccsidr) { + if (val == get_ccsidr(vcpu, csselr)) + return 0; + + ccsidr = kmalloc_array(CSSELR_MAX, sizeof(u32), GFP_KERNEL_ACCOUNT); + if (!ccsidr) + return -ENOMEM; + + for (i = 0; i < CSSELR_MAX; i++) + ccsidr[i] = get_ccsidr(vcpu, i); + + vcpu->arch.ccsidr = ccsidr; + } - /* Make sure noone else changes CSSELR during this! */ - local_irq_disable(); - write_sysreg(csselr, csselr_el1); - isb(); - ccsidr = read_sysreg(ccsidr_el1); - local_irq_enable(); + ccsidr[csselr] = val; + + return 0; +} + +static bool access_rw(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, r->reg); + else + p->regval = vcpu_read_sys_reg(vcpu, r->reg); - return ccsidr; + return true; } /* @@ -119,12 +513,23 @@ static bool access_dcsw(struct kvm_vcpu *vcpu, * CPU left in the system, and certainly not from non-secure * software). */ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) kvm_set_way_flush(vcpu); return true; } +static bool access_dcgsw(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_mte(vcpu->kvm)) + return undef_access(vcpu, p, r); + + /* Treat MTE S/W ops as we treat the classic ones: with contempt */ + return access_dcsw(vcpu, p, r); +} + static void get_access_mask(const struct sys_reg_desc *r, u64 *mask, u64 *shift) { switch (r->aarch32_map) { @@ -200,6 +605,9 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, { bool g1; + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (!p->is_write) return read_from_write_only(vcpu, p, r); @@ -243,10 +651,33 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (p->is_write) return ignore_write(vcpu, p); - p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; + if (p->Op1 == 4) { /* ICC_SRE_EL2 */ + p->regval = KVM_ICC_SRE_EL2; + } else { /* ICC_SRE_EL1 */ + p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; + } + + return true; +} + +static bool access_gic_dir(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + + if (!p->is_write) + return undef_access(vcpu, p, r); + + vgic_v3_deactivate(vcpu, p->regval); + return true; } @@ -270,13 +701,10 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); u32 sr = reg_to_encoding(r); - if (!(val & (0xfUL << ID_AA64MMFR1_EL1_LO_SHIFT))) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) + return undef_access(vcpu, p, r); if (p->is_write && sr == SYS_LORID_EL1) return write_to_read_only(vcpu, p, r); @@ -288,17 +716,10 @@ static bool trap_oslar_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 oslsr; - if (!p->is_write) return read_from_write_only(vcpu, p, r); - /* Forward the OSLK bit to OSLSR */ - oslsr = __vcpu_sys_reg(vcpu, OSLSR_EL1) & ~SYS_OSLSR_OSLK; - if (p->regval & SYS_OSLAR_OSLK) - oslsr |= SYS_OSLSR_OSLK; - - __vcpu_sys_reg(vcpu, OSLSR_EL1) = oslsr; + kvm_debug_handle_oslar(vcpu, p->regval); return true; } @@ -320,10 +741,10 @@ static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, * The only modifiable bit is the OSLK bit. Refuse the write if * userspace attempts to change any other bit in the register. */ - if ((val ^ rd->val) & ~SYS_OSLSR_OSLK) + if ((val ^ rd->val) & ~OSLSR_EL1_OSLK) return -EINVAL; - __vcpu_sys_reg(vcpu, rd->reg) = val; + __vcpu_assign_sys_reg(vcpu, rd->reg, val); return 0; } @@ -339,46 +760,13 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, } } -/* - * We want to avoid world-switching all the DBG registers all the - * time: - * - * - If we've touched any debug register, it is likely that we're - * going to touch more of them. It then makes sense to disable the - * traps and start doing the save/restore dance - * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is - * then mandatory to save/restore the registers, as the guest - * depends on them. - * - * For this, we use a DIRTY bit, indicating the guest has modified the - * debug registers, used as follow: - * - * On guest entry: - * - If the dirty bit is set (because we're coming back from trapping), - * disable the traps, save host registers, restore guest registers. - * - If debug is actively in use (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), - * set the dirty bit, disable the traps, save host registers, - * restore guest registers. - * - Otherwise, enable the traps - * - * On guest exit: - * - If the dirty bit is set, save guest registers, restore host - * registers and clear the dirty bit. This ensure that the host can - * now use the debug registers. - */ static bool trap_debug_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (p->is_write) { - vcpu_write_sys_reg(vcpu, p->regval, r->reg); - vcpu_set_flag(vcpu, DEBUG_DIRTY); - } else { - p->regval = vcpu_read_sys_reg(vcpu, r->reg); - } - - trace_trap_reg(__func__, r->reg, p->is_write, p->regval); + access_rw(vcpu, p, r); + kvm_debug_set_guest_ownership(vcpu); return true; } @@ -387,9 +775,6 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, * * A 32 bit write to a debug register leave top bits alone * A 32 bit read from a debug register only returns the bottom bits - * - * All writes will set the DEBUG_DIRTY flag to ensure the hyp code - * switches between host and guest values in future. */ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -404,8 +789,6 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu, val &= ~mask; val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; - - vcpu_set_flag(vcpu, DEBUG_DIRTY); } static void dbg_to_reg(struct kvm_vcpu *vcpu, @@ -419,164 +802,97 @@ static void dbg_to_reg(struct kvm_vcpu *vcpu, p->regval = (*dbg_reg & mask) >> shift; } -static bool trap_bvr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) +static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); - - return true; -} - -static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = val; - return 0; -} - -static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - return 0; + struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state; + + switch (rd->Op2) { + case 0b100: + return &dbg->dbg_bvr[rd->CRm]; + case 0b101: + return &dbg->dbg_bcr[rd->CRm]; + case 0b110: + return &dbg->dbg_wvr[rd->CRm]; + case 0b111: + return &dbg->dbg_wcr[rd->CRm]; + default: + KVM_BUG_ON(1, vcpu->kvm); + return NULL; + } } -static void reset_bvr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; -} + u64 *reg = demux_wb_reg(vcpu, rd); -static bool trap_bcr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; + if (!reg) + return false; if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); + reg_to_dbg(vcpu, p, rd, reg); else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); + dbg_to_reg(vcpu, p, rd, reg); + kvm_debug_set_guest_ownership(vcpu); return true; } -static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = val; - return 0; -} - -static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - return 0; -} - -static void reset_bcr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; -} - -static bool trap_wvr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) +static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]); + u64 *reg = demux_wb_reg(vcpu, rd); - return true; -} - -static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = val; - return 0; -} + if (!reg) + return -EINVAL; -static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; + *reg = val; return 0; } -static void reset_wvr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; -} - -static bool trap_wcr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) +static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; + u64 *reg = demux_wb_reg(vcpu, rd); - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); - - return true; -} + if (!reg) + return -EINVAL; -static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = val; + *val = *reg; return 0; } -static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) +static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - *val = vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - return 0; -} + u64 *reg = demux_wb_reg(vcpu, rd); -static void reset_wcr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; + /* + * Bail early if we couldn't find storage for the register, the + * KVM_BUG_ON() in demux_wb_reg() will prevent this VM from ever + * being run. + */ + if (!reg) + return 0; + + *reg = rd->val; + return rd->val; } -static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 amair = read_sysreg(amair_el1); vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1); + return amair; } -static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 actlr = read_sysreg(actlr_el1); vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1); + return actlr; } -static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 mpidr; @@ -590,7 +906,16 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); - vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1); + mpidr |= (1ULL << 31); + vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1); + + return mpidr; +} + +static unsigned int hidden_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return REG_HIDDEN; } static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, @@ -602,55 +927,62 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); - - /* No PMU available, any PMU reg may UNDEF... */ - if (!kvm_arm_support_pmu_v3()) - return; + u64 mask = BIT(ARMV8_PMU_CYCLE_IDX); + u8 n = vcpu->kvm->arch.nr_pmu_counters; - n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; - n &= ARMV8_PMU_PMCR_N_MASK; if (n) mask |= GENMASK(n - 1, 0); reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= mask; + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, mask); + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, GENMASK(31, 0)); + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { + /* This thing will UNDEF, who cares about the reset value? */ + if (!kvm_vcpu_has_pmu(vcpu)) + return 0; + reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK; + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, kvm_pmu_evtyper_mask(vcpu->kvm)); + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, PMSELR_EL0_SEL_MASK); + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 pmcr; - - /* No PMU available, PMCR_EL0 may UNDEF... */ - if (!kvm_arm_support_pmu_v3()) - return; + u64 pmcr = 0; - /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ - pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); if (!kvm_supports_32bit_el0()) pmcr |= ARMV8_PMU_PMCR_LC; - __vcpu_sys_reg(vcpu, r->reg) = pmcr; + /* + * The value of PMCR.N field is included when the + * vCPU register is read via kvm_vcpu_read_pmcr(). + */ + __vcpu_assign_sys_reg(vcpu, r->reg, pmcr); + + return __vcpu_sys_reg(vcpu, r->reg); } static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) @@ -697,16 +1029,15 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * Only update writeable bits of PMCR (continuing into * kvm_pmu_handle_pmcr() as well) */ - val = __vcpu_sys_reg(vcpu, PMCR_EL0); + val = kvm_vcpu_read_pmcr(vcpu); val &= ~ARMV8_PMU_PMCR_MASK; val |= p->regval & ARMV8_PMU_PMCR_MASK; if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; kvm_pmu_handle_pmcr(vcpu, val); - kvm_vcpu_pmu_restore_guest(vcpu); } else { /* PMCR.P & PMCR.C are RAZ */ - val = __vcpu_sys_reg(vcpu, PMCR_EL0) + val = kvm_vcpu_read_pmcr(vcpu) & ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C); p->regval = val; } @@ -721,11 +1052,11 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return false; if (p->is_write) - __vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval; + __vcpu_assign_sys_reg(vcpu, PMSELR_EL0, p->regval); else /* return PMSELR.SEL field */ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + & PMSELR_EL0_SEL_MASK; return true; } @@ -755,8 +1086,8 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) { u64 pmcr, val; - pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); - val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; + pmcr = kvm_vcpu_read_pmcr(vcpu); + val = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { kvm_inject_undefined(vcpu); return false; @@ -765,6 +1096,38 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) return true; } +static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u64 idx; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) + /* PMCCNTR_EL0 */ + idx = ARMV8_PMU_CYCLE_IDX; + else + /* PMEVCNTRn_EL0 */ + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + + *val = kvm_pmu_get_counter_value(vcpu, idx); + return 0; +} + +static int set_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u64 idx; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) + /* PMCCNTR_EL0 */ + idx = ARMV8_PMU_CYCLE_IDX; + else + /* PMEVCNTRn_EL0 */ + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + + kvm_pmu_set_counter_value_user(vcpu, idx, val); + return 0; +} + static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -777,8 +1140,8 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, if (pmu_access_event_counter_el0_disabled(vcpu)) return false; - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); } else if (r->Op2 == 0) { /* PMCCNTR_EL0 */ if (pmu_access_cycle_counter_el0_disabled(vcpu)) @@ -828,7 +1191,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { /* PMXEVTYPER_EL0 */ - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); reg = PMEVTYPER0_EL0 + idx; } else if (r->CRn == 14 && (r->CRm & 12) == 12) { idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); @@ -846,15 +1209,32 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) { kvm_pmu_set_counter_event_type(vcpu, p->regval, idx); - __vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK; kvm_vcpu_pmu_restore_guest(vcpu); } else { - p->regval = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK; + p->regval = __vcpu_sys_reg(vcpu, reg); } return true; } +static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) +{ + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); + + __vcpu_assign_sys_reg(vcpu, r->reg, val & mask); + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + return 0; +} + +static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) +{ + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); + + *val = __vcpu_sys_reg(vcpu, r->reg) & mask; + return 0; +} + static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -863,19 +1243,17 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; - if (r->Op2 & 0x1) { + if (r->Op2 & 0x1) /* accessing PMCNTENSET_EL0 */ - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; - kvm_pmu_enable_counter_mask(vcpu, val); - kvm_vcpu_pmu_restore_guest(vcpu); - } else { + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, |=, val); + else /* accessing PMCNTENCLR_EL0 */ - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; - kvm_pmu_disable_counter_mask(vcpu, val); - } + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, ~val); + + kvm_pmu_reprogram_counter_mask(vcpu, val); } else { p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } @@ -886,7 +1264,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (check_pmu_access_disabled(vcpu, 0)) return false; @@ -896,10 +1274,10 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (r->Op2 & 0x1) /* accessing PMINTENSET_EL1 */ - __vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val; + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, |=, val); else /* accessing PMINTENCLR_EL1 */ - __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, ~val); } else { p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } @@ -910,7 +1288,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (pmu_access_el0_disabled(vcpu)) return false; @@ -918,10 +1296,10 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) { if (r->CRm & 0x2) /* accessing PMOVSSET_EL0 */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, (p->regval & mask)); else /* accessing PMOVSCLR_EL0 */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, ~(p->regval & mask)); } else { p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); } @@ -940,7 +1318,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_write_swinc_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); kvm_pmu_software_increment(vcpu, p->regval & mask); return true; } @@ -949,13 +1327,11 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { - if (!vcpu_mode_priv(vcpu)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!vcpu_mode_priv(vcpu)) + return undef_access(vcpu, p, r); - __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = - p->regval & ARMV8_PMU_USERENR_MASK; + __vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0, + (p->regval & ARMV8_PMU_USERENR_MASK)); } else { p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0) & ARMV8_PMU_USERENR_MASK; @@ -964,40 +1340,86 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } +static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = kvm_vcpu_read_pmcr(vcpu); + return 0; +} + +static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u8 new_n = FIELD_GET(ARMV8_PMU_PMCR_N, val); + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->arch.config_lock); + + /* + * The vCPU can't have more counters than the PMU hardware + * implements. Ignore this error to maintain compatibility + * with the existing KVM behavior. + */ + if (!kvm_vm_has_ran_once(kvm) && + !vcpu_has_nv(vcpu) && + new_n <= kvm_arm_pmu_get_max_counters(kvm)) + kvm->arch.nr_pmu_counters = new_n; + + mutex_unlock(&kvm->arch.config_lock); + + /* + * Ignore writes to RES0 bits, read only bits that are cleared on + * vCPU reset, and writable bits that KVM doesn't support yet. + * (i.e. only PMCR.N and bits [7:0] are mutable from userspace) + * The LP bit is RES0 when FEAT_PMUv3p5 is not supported on the vCPU. + * But, we leave the bit as it is here, as the vCPU's PMUver might + * be changed later (NOTE: the bit will be cleared on first vCPU run + * if necessary). + */ + val &= ARMV8_PMU_PMCR_MASK; + + /* The LC bit is RES1 when AArch32 is not supported */ + if (!kvm_supports_32bit_el0()) + val |= ARMV8_PMU_PMCR_LC; + + __vcpu_assign_sys_reg(vcpu, r->reg, val); + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + return 0; +} + /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ - trap_bvr, reset_bvr, 0, 0, get_bvr, set_bvr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ - trap_bcr, reset_bcr, 0, 0, get_bcr, set_bcr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ - trap_wvr, reset_wvr, 0, 0, get_wvr, set_wvr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ - trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr } + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg } -#define PMU_SYS_REG(r) \ - SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility +#define PMU_SYS_REG(name) \ + SYS_DESC(SYS_##name), .reset = reset_pmu_reg, \ + .visibility = pmu_visibility /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ - { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ - .reset = reset_pmevcntr, \ + { PMU_SYS_REG(PMEVCNTRn_EL0(n)), \ + .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ + .set_user = set_pmu_evcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ #define PMU_PMEVTYPER_EL0(n) \ - { PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)), \ + { PMU_SYS_REG(PMEVTYPERn_EL0(n)), \ .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } -static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - - return false; -} - /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access } #define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access } @@ -1034,22 +1456,149 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, switch (reg) { case SYS_CNTP_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTV_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_AARCH32_CNTP_TVAL: + case SYS_CNTP_TVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_TVAL; break; + + case SYS_CNTV_TVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHP_TVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHV_TVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_CNTP_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTV_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_AARCH32_CNTP_CTL: + case SYS_CNTP_CTL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CTL; break; + + case SYS_CNTV_CTL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHP_CTL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHV_CTL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_CNTP_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTV_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + case SYS_AARCH32_CNTP_CVAL: + case SYS_CNTP_CVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; + + case SYS_CNTV_CVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHP_CVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHV_CVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CNT; + break; + + case SYS_AARCH32_CNTPCT: + case SYS_AARCH32_CNTPCTSS: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CNT; + break; + + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + + case SYS_AARCH32_CNTVCT: + case SYS_AARCH32_CNTVCTSS: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + default: - BUG(); + print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); + return undef_access(vcpu, p, r); } if (p->is_write) @@ -1060,25 +1609,138 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } -static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) +static int arch_timer_set_user(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) { - if (kvm_vcpu_has_pmu(vcpu)) - return vcpu->kvm->arch.dfr0_pmuver.imp; + switch (reg_to_encoding(rd)) { + case SYS_CNTV_CTL_EL0: + case SYS_CNTP_CTL_EL0: + case SYS_CNTHV_CTL_EL2: + case SYS_CNTHP_CTL_EL2: + val &= ~ARCH_TIMER_CTRL_IT_STAT; + break; + case SYS_CNTVCT_EL0: + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) + timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read() - val); + return 0; + case SYS_CNTPCT_EL0: + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) + timer_set_offset(vcpu_ptimer(vcpu), kvm_phys_timer_read() - val); + return 0; + } - return vcpu->kvm->arch.dfr0_pmuver.unimp; + __vcpu_assign_sys_reg(vcpu, rd->reg, val); + return 0; } -static u8 perfmon_to_pmuver(u8 perfmon) +static int arch_timer_get_user(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 *val) { - switch (perfmon) { - case ID_DFR0_EL1_PerfMon_PMUv3: - return ID_AA64DFR0_EL1_PMUVer_IMP; - case ID_DFR0_EL1_PerfMon_IMPDEF: - return ID_AA64DFR0_EL1_PMUVer_IMP_DEF; + switch (reg_to_encoding(rd)) { + case SYS_CNTVCT_EL0: + *val = kvm_phys_timer_read() - timer_get_offset(vcpu_vtimer(vcpu)); + break; + case SYS_CNTPCT_EL0: + *val = kvm_phys_timer_read() - timer_get_offset(vcpu_ptimer(vcpu)); + break; default: - /* Anything ARMv8.1+ and NI have the same value. For now. */ - return perfmon; + *val = __vcpu_sys_reg(vcpu, rd->reg); + } + + return 0; +} + +static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, + s64 new, s64 cur) +{ + struct arm64_ftr_bits kvm_ftr = *ftrp; + + /* Some features have different safe value type in KVM than host features */ + switch (id) { + case SYS_ID_AA64DFR0_EL1: + switch (kvm_ftr.shift) { + case ID_AA64DFR0_EL1_PMUVer_SHIFT: + kvm_ftr.type = FTR_LOWER_SAFE; + break; + case ID_AA64DFR0_EL1_DebugVer_SHIFT: + kvm_ftr.type = FTR_LOWER_SAFE; + break; + } + break; + case SYS_ID_DFR0_EL1: + if (kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT) + kvm_ftr.type = FTR_LOWER_SAFE; + break; + } + + return arm64_ftr_safe_value(&kvm_ftr, new, cur); +} + +/* + * arm64_check_features() - Check if a feature register value constitutes + * a subset of features indicated by the idreg's KVM sanitised limit. + * + * This function will check if each feature field of @val is the "safe" value + * against idreg's KVM sanitised limit return from reset() callback. + * If a field value in @val is the same as the one in limit, it is always + * considered the safe value regardless For register fields that are not in + * writable, only the value in limit is considered the safe value. + * + * Return: 0 if all the fields are safe. Otherwise, return negative errno. + */ +static int arm64_check_features(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + const struct arm64_ftr_reg *ftr_reg; + const struct arm64_ftr_bits *ftrp = NULL; + u32 id = reg_to_encoding(rd); + u64 writable_mask = rd->val; + u64 limit = rd->reset(vcpu, rd); + u64 mask = 0; + + /* + * Hidden and unallocated ID registers may not have a corresponding + * struct arm64_ftr_reg. Of course, if the register is RAZ we know the + * only safe value is 0. + */ + if (sysreg_visible_as_raz(vcpu, rd)) + return val ? -E2BIG : 0; + + ftr_reg = get_arm64_ftr_reg(id); + if (!ftr_reg) + return -EINVAL; + + ftrp = ftr_reg->ftr_bits; + + for (; ftrp && ftrp->width; ftrp++) { + s64 f_val, f_lim, safe_val; + u64 ftr_mask; + + ftr_mask = arm64_ftr_mask(ftrp); + if ((ftr_mask & writable_mask) != ftr_mask) + continue; + + f_val = arm64_ftr_value(ftrp, val); + f_lim = arm64_ftr_value(ftrp, limit); + mask |= ftr_mask; + + if (f_val == f_lim) + safe_val = f_val; + else + safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim); + + if (safe_val != f_val) + return -E2BIG; } + + /* For fields that are not writable, values in limit are the safe values. */ + if ((val & ~mask) != (limit & ~mask)) + return -E2BIG; + + return 0; } static u8 pmuver_to_perfmon(u8 pmuver) @@ -1094,8 +1756,13 @@ static u8 pmuver_to_perfmon(u8 pmuver) } } +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); + /* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) +static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) { u32 id = reg_to_encoding(r); u64 val; @@ -1106,60 +1773,116 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r val = read_sanitised_ftr_reg(id); switch (id) { + case SYS_ID_AA64DFR0_EL1: + val = sanitise_id_aa64dfr0_el1(vcpu, val); + break; case SYS_ID_AA64PFR0_EL1: - if (!vcpu_has_sve(vcpu)) - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); - if (kvm_vgic_global_state.type == VGIC_V3) { - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); - } + val = sanitise_id_aa64pfr0_el1(vcpu, val); break; case SYS_ID_AA64PFR1_EL1: - if (!kvm_has_mte(vcpu->kvm)) - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); + val = sanitise_id_aa64pfr1_el1(vcpu, val); + break; + case SYS_ID_AA64PFR2_EL1: + val &= ID_AA64PFR2_EL1_FPMR | + (kvm_has_mte(vcpu->kvm) ? + ID_AA64PFR2_EL1_MTEFAR | ID_AA64PFR2_EL1_MTESTOREONLY : + 0); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) - val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); + val &= ~(ID_AA64ISAR1_EL1_APA | + ID_AA64ISAR1_EL1_API | + ID_AA64ISAR1_EL1_GPA | + ID_AA64ISAR1_EL1_GPI); break; case SYS_ID_AA64ISAR2_EL1: if (!vcpu_has_ptrauth(vcpu)) - val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); - if (!cpus_have_final_cap(ARM64_HAS_WFXT)) - val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); + val &= ~(ID_AA64ISAR2_EL1_APA3 | + ID_AA64ISAR2_EL1_GPA3); + if (!cpus_have_final_cap(ARM64_HAS_WFXT) || + has_broken_cntvoff()) + val &= ~ID_AA64ISAR2_EL1_WFxT; break; - case SYS_ID_AA64DFR0_EL1: - /* Limit debug to ARMv8.0 */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); - /* Set PMUver to the required version */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), - vcpu_pmuver(vcpu)); - /* Hide SPE from guests */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); + case SYS_ID_AA64ISAR3_EL1: + val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE | + ID_AA64ISAR3_EL1_FAMINMAX; break; - case SYS_ID_DFR0_EL1: - val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), - pmuver_to_perfmon(vcpu_pmuver(vcpu))); + case SYS_ID_AA64MMFR2_EL1: + val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; + val &= ~ID_AA64MMFR2_EL1_NV; + break; + case SYS_ID_AA64MMFR3_EL1: + val &= ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_SCTLRX | + ID_AA64MMFR3_EL1_S1POE | + ID_AA64MMFR3_EL1_S1PIE; + break; + case SYS_ID_MMFR4_EL1: + val &= ~ID_MMFR4_EL1_CCIDX; break; } + if (vcpu_has_nv(vcpu)) + val = limit_nv_id_reg(vcpu->kvm, id, val); + return val; } +static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return __kvm_read_sanitised_id_reg(vcpu, r); +} + +static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r)); +} + +static bool is_feature_id_reg(u32 encoding) +{ + return (sys_reg_Op0(encoding) == 3 && + (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) && + sys_reg_CRn(encoding) == 0 && + sys_reg_CRm(encoding) <= 7); +} + +/* + * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is + * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID + * registers KVM maintains on a per-VM basis. + * + * Additionally, the implementation ID registers and CTR_EL0 are handled as + * per-VM registers. + */ +static inline bool is_vm_ftr_id_reg(u32 id) +{ + switch (id) { + case SYS_CTR_EL0: + case SYS_MIDR_EL1: + case SYS_REVIDR_EL1: + case SYS_AIDR_EL1: + return true; + default: + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && + sys_reg_CRm(id) < 8); + + } +} + +static inline bool is_vcpu_ftr_id_reg(u32 id) +{ + return is_feature_id_reg(id) && !is_vm_ftr_id_reg(id); +} + +static inline bool is_aa32_id_reg(u32 id) +{ + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && + sys_reg_CRm(id) <= 3); +} + static unsigned int id_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { @@ -1205,6 +1928,7 @@ static bool access_id_reg(struct kvm_vcpu *vcpu, return write_to_read_only(vcpu, p, r); p->regval = read_id_reg(vcpu, r); + return true; } @@ -1218,88 +1942,196 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, - u64 val) +static unsigned int sme_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) { - u8 csv2, csv3; + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_fpmr(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + if (!vcpu_has_sve(vcpu)) + val &= ~ID_AA64PFR0_EL1_SVE_MASK; /* - * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as - * it doesn't promise more than what is actually provided (the - * guest could otherwise be covered in ectoplasmic residue). + * The default is to expose CSV2 == 1 if the HW isn't affected. + * Although this is a per-CPU feature, we make it global because + * asymmetric systems are just a nuisance. + * + * Userspace can override this as long as it doesn't promise + * the impossible. */ - csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT); - if (csv2 > 1 || - (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED)) - return -EINVAL; + if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { + val &= ~ID_AA64PFR0_EL1_CSV2_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP); + } + if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { + val &= ~ID_AA64PFR0_EL1_CSV3_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP); + } - /* Same thing for CSV3 */ - csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT); - if (csv3 > 1 || - (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED)) - return -EINVAL; + if (vgic_is_v3(vcpu->kvm)) { + val &= ~ID_AA64PFR0_EL1_GIC_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + } - /* We can only differ with CSV[23], and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); - if (val) - return -EINVAL; + val &= ~ID_AA64PFR0_EL1_AMU_MASK; + + /* + * MPAM is disabled by default as KVM also needs a set of PARTID to + * program the MPAMVPMx_EL2 PARTID remapping registers with. But some + * older kernels let the guest see the ID bit. + */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; - vcpu->kvm->arch.pfr0_csv2 = csv2; - vcpu->kvm->arch.pfr0_csv3 = csv3; + return val; +} - return 0; +static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + if (!kvm_has_mte(vcpu->kvm)) { + val &= ~ID_AA64PFR1_EL1_MTE; + val &= ~ID_AA64PFR1_EL1_MTE_frac; + } + + if (!(cpus_have_final_cap(ARM64_HAS_RASV1P1_EXTN) && + SYS_FIELD_GET(ID_AA64PFR0_EL1, RAS, pfr0) == ID_AA64PFR0_EL1_RAS_IMP)) + val &= ~ID_AA64PFR1_EL1_RAS_frac; + + val &= ~ID_AA64PFR1_EL1_SME; + val &= ~ID_AA64PFR1_EL1_RNDR_trap; + val &= ~ID_AA64PFR1_EL1_NMI; + val &= ~ID_AA64PFR1_EL1_GCS; + val &= ~ID_AA64PFR1_EL1_THE; + val &= ~ID_AA64PFR1_EL1_MTEX; + val &= ~ID_AA64PFR1_EL1_PFAR; + val &= ~ID_AA64PFR1_EL1_MPAM_frac; + + return val; +} + +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); + + /* + * Only initialize the PMU version if the vCPU was configured with one. + */ + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + if (kvm_vcpu_has_pmu(vcpu)) + val |= SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer, + kvm_arm_pmu_get_pmuver_limit()); + + /* Hide SPE from guests */ + val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; + + /* Hide BRBE from guests */ + val &= ~ID_AA64DFR0_EL1_BRBE_MASK; + + return val; +} + +/* + * Older versions of KVM erroneously claim support for FEAT_DoubleLock with + * NV-enabled VMs on unsupporting hardware. Silently ignore the incorrect + * value if it is consistent with the bug. + */ +static bool ignore_feat_doublelock(struct kvm_vcpu *vcpu, u64 val) +{ + u8 host, user; + + if (!vcpu_has_nv(vcpu)) + return false; + + host = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock, + read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); + user = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock, val); + + return host == ID_AA64DFR0_EL1_DoubleLock_NI && + user == ID_AA64DFR0_EL1_DoubleLock_IMP; } static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - u8 pmuver, host_pmuver; - bool valid_pmu; + u8 debugver = SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, val); + u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); - host_pmuver = kvm_arm_pmu_get_pmuver_limit(); + /* + * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the + * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously + * exposed an IMP_DEF PMU to userspace and the guest on systems w/ + * non-architectural PMUs. Of course, PMUv3 is the only game in town for + * PMU virtualization, so the IMP_DEF value was rather user-hostile. + * + * At minimum, we're on the hook to allow values that were given to + * userspace by KVM. Cover our tracks here and replace the IMP_DEF value + * with a more sensible NI. The value of an ID register changing under + * the nose of the guest is unfortunate, but is certainly no more + * surprising than an ill-guided PMU driver poking at impdef system + * registers that end in an UNDEF... + */ + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; /* - * Allow AA64DFR0_EL1.PMUver to be set from userspace as long - * as it doesn't promise more than what the HW gives us. We - * allow an IMPDEF PMU though, only if no PMU is supported - * (KVM backward compatibility handling). + * ID_AA64DFR0_EL1.DebugVer is one of those awkward fields with a + * nonzero minimum safe value. */ - pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val); - if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver)) + if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP) return -EINVAL; - valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF); + if (ignore_feat_doublelock(vcpu, val)) { + val &= ~ID_AA64DFR0_EL1_DoubleLock; + val |= SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI); + } - /* Make sure view register and PMU support do match */ - if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) - return -EINVAL; + return set_id_reg(vcpu, rd, val); +} - /* We can only differ with PMUver, and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); - if (val) - return -EINVAL; +static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u8 perfmon; + u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1); - if (valid_pmu) - vcpu->kvm->arch.dfr0_pmuver.imp = pmuver; - else - vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver; + val &= ~ID_DFR0_EL1_PerfMon_MASK; + if (kvm_vcpu_has_pmu(vcpu)) { + perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon); + } - return 0; + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8); + + return val; } static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - u8 perfmon, host_perfmon; - bool valid_pmu; + u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val); + u8 copdbg = SYS_FIELD_GET(ID_DFR0_EL1, CopDbg, val); - host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) { + val &= ~ID_DFR0_EL1_PerfMon_MASK; + perfmon = 0; + } /* * Allow DFR0_EL1.PerfMon to be set from userspace as long as @@ -1307,29 +2139,158 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, * AArch64 side (as everything is emulated with that), and * that this is a PMUv3. */ - perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), val); - if ((perfmon != ID_DFR0_EL1_PerfMon_IMPDEF && perfmon > host_perfmon) || - (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3)) + if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3) return -EINVAL; - valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_EL1_PerfMon_IMPDEF); + if (copdbg < ID_DFR0_EL1_CopDbg_Armv8) + return -EINVAL; + + return set_id_reg(vcpu, rd, val); +} + +static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK; + + /* + * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits + * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to + * guests, but didn't add trap handling. KVM doesn't support MPAM and + * always returns an UNDEF for these registers. The guest must see 0 + * for this field. + * + * But KVM must also accept values from user-space that were provided + * by KVM. On CPUs that support MPAM, permit user-space to write + * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field. + */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; - /* Make sure view register and PMU support do match */ - if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) + /* Fail the guest's request to disable the AA64 ISA at EL{0,1,2} */ + if (!FIELD_GET(ID_AA64PFR0_EL1_EL0, user_val) || + !FIELD_GET(ID_AA64PFR0_EL1_EL1, user_val) || + (vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val))) return -EINVAL; - /* We can only differ with PerfMon, and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon); - if (val) + /* + * If we are running on a GICv5 host and support FEAT_GCIE_LEGACY, then + * we support GICv3. Fail attempts to do anything but set that to IMP. + */ + if (vgic_is_v3_compat(vcpu->kvm) && + FIELD_GET(ID_AA64PFR0_EL1_GIC_MASK, user_val) != ID_AA64PFR0_EL1_GIC_IMP) return -EINVAL; - if (valid_pmu) - vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon); - else - vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon); + return set_id_reg(vcpu, rd, user_val); +} - return 0; +static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK; + u8 mte = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE, hw_val); + u8 user_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, user_val); + u8 hw_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, hw_val); + + /* See set_id_aa64pfr0_el1 for comment about MPAM */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + + /* + * Previously MTE_frac was hidden from guest. However, if the + * hardware supports MTE2 but not MTE_ASYM_FAULT then a value + * of 0 for this field indicates that the hardware supports + * MTE_ASYNC. Whereas, 0xf indicates MTE_ASYNC is not supported. + * + * As KVM must accept values from KVM provided by user-space, + * when ID_AA64PFR1_EL1.MTE is 2 allow user-space to set + * ID_AA64PFR1_EL1.MTE_frac to 0. However, ignore it to avoid + * incorrectly claiming hardware support for MTE_ASYNC in the + * guest. + */ + + if (mte == ID_AA64PFR1_EL1_MTE_MTE2 && + hw_mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI && + user_mte_frac == ID_AA64PFR1_EL1_MTE_frac_ASYNC) { + user_val &= ~ID_AA64PFR1_EL1_MTE_frac_MASK; + user_val |= hw_val & ID_AA64PFR1_EL1_MTE_frac_MASK; + } + + return set_id_reg(vcpu, rd, user_val); +} + +/* + * Allow userspace to de-feature a stage-2 translation granule but prevent it + * from claiming the impossible. + */ +#define tgran2_val_allowed(tg, safe, user) \ +({ \ + u8 __s = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, safe); \ + u8 __u = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, user); \ + \ + __s == __u || __u == ID_AA64MMFR0_EL1_##tg##_NI; \ +}) + +static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd); + + if (!vcpu_has_nv(vcpu)) + return set_id_reg(vcpu, rd, user_val); + + if (!tgran2_val_allowed(TGRAN4_2, sanitized_val, user_val) || + !tgran2_val_allowed(TGRAN16_2, sanitized_val, user_val) || + !tgran2_val_allowed(TGRAN64_2, sanitized_val, user_val)) + return -EINVAL; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK; + + /* + * We made the mistake to expose the now deprecated NV field, + * so allow userspace to write it, but silently ignore it. + */ + if ((hw_val & nv_mask) == (user_val & nv_mask)) + user_val &= ~nv_mask; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_ctr_el0(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u8 user_L1Ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val); + + /* + * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved. + * Hence only allow to set VIPT(0b10) or PIPT(0b11) for L1Ip based + * on what hardware reports. + * + * Using a VIPT software model on PIPT will lead to over invalidation, + * but still correct. Hence, we can allow downgrading PIPT to VIPT, + * but not the other way around. This is handled via arm64_ftr_safe_value() + * as CTR_EL0 ftr_bits has L1Ip field with type FTR_EXACT and safe value + * set as VIPT. + */ + switch (user_L1Ip) { + case CTR_EL0_L1Ip_RESERVED_VPIPT: + case CTR_EL0_L1Ip_RESERVED_AIVIVT: + return -EINVAL; + case CTR_EL0_L1Ip_VIPT: + case CTR_EL0_L1Ip_PIPT: + return set_id_reg(vcpu, rd, user_val); + default: + return -ENOENT; + } } /* @@ -1342,18 +2303,72 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { + /* + * Avoid locking if the VM has already started, as the ID registers are + * guaranteed to be invariant at that point. + */ + if (kvm_vm_has_ran_once(vcpu->kvm)) { + *val = read_id_reg(vcpu, rd); + return 0; + } + + mutex_lock(&vcpu->kvm->arch.config_lock); *val = read_id_reg(vcpu, rd); + mutex_unlock(&vcpu->kvm->arch.config_lock); + return 0; } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - /* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(vcpu, rd)) - return -EINVAL; + u32 id = reg_to_encoding(rd); + int ret; - return 0; + mutex_lock(&vcpu->kvm->arch.config_lock); + + /* + * Once the VM has started the ID registers are immutable. Reject any + * write that does not match the final register value. + */ + if (kvm_vm_has_ran_once(vcpu->kvm)) { + if (val != read_id_reg(vcpu, rd)) + ret = -EBUSY; + else + ret = 0; + + mutex_unlock(&vcpu->kvm->arch.config_lock); + return ret; + } + + ret = arm64_check_features(vcpu, rd, val); + if (!ret) + kvm_set_vm_id_reg(vcpu->kvm, id, val); + + mutex_unlock(&vcpu->kvm->arch.config_lock); + + /* + * arm64_check_features() returns -E2BIG to indicate the register's + * feature set is a superset of the maximally-allowed register value. + * While it would be nice to precisely describe this to userspace, the + * existing UAPI for KVM_SET_ONE_REG has it that invalid register + * writes return -EINVAL. + */ + if (ret == -E2BIG) + ret = -EINVAL; + return ret; +} + +void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val) +{ + u64 *p = __vm_id_reg(&kvm->arch, reg); + + lockdep_assert_held(&kvm->arch.config_lock); + + if (KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !p, kvm)) + return; + + *p = val; } static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, @@ -1375,7 +2390,7 @@ static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0); + p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0); return true; } @@ -1385,10 +2400,80 @@ static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_sysreg(clidr_el1); + p->regval = __vcpu_sys_reg(vcpu, r->reg); return true; } +/* + * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary + * by the physical CPU which the vcpu currently resides in. + */ +static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); + u64 clidr; + u8 loc; + + if ((ctr_el0 & CTR_EL0_IDC)) { + /* + * Data cache clean to the PoU is not required so LoUU and LoUIS + * will not be set and a unified cache, which will be marked as + * LoC, will be added. + * + * If not DIC, let the unified cache L2 so that an instruction + * cache can be added as L1 later. + */ + loc = (ctr_el0 & CTR_EL0_DIC) ? 1 : 2; + clidr = CACHE_TYPE_UNIFIED << CLIDR_CTYPE_SHIFT(loc); + } else { + /* + * Data cache clean to the PoU is required so let L1 have a data + * cache and mark it as LoUU and LoUIS. As L1 has a data cache, + * it can be marked as LoC too. + */ + loc = 1; + clidr = 1 << CLIDR_LOUU_SHIFT; + clidr |= 1 << CLIDR_LOUIS_SHIFT; + clidr |= CACHE_TYPE_DATA << CLIDR_CTYPE_SHIFT(1); + } + + /* + * Instruction cache invalidation to the PoU is required so let L1 have + * an instruction cache. If L1 already has a data cache, it will be + * CACHE_TYPE_SEPARATE. + */ + if (!(ctr_el0 & CTR_EL0_DIC)) + clidr |= CACHE_TYPE_INST << CLIDR_CTYPE_SHIFT(1); + + clidr |= loc << CLIDR_LOC_SHIFT; + + /* + * Add tag cache unified to data cache. Allocation tags and data are + * unified in a cache line so that it looks valid even if there is only + * one cache line. + */ + if (kvm_has_mte(vcpu->kvm)) + clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc); + + __vcpu_assign_sys_reg(vcpu, r->reg, clidr); + + return __vcpu_sys_reg(vcpu, r->reg); +} + +static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) +{ + u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); + u64 idc = !CLIDR_LOC(val) || (!CLIDR_LOUIS(val) && !CLIDR_LOUU(val)); + + if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc)) + return -EINVAL; + + __vcpu_assign_sys_reg(vcpu, rd->reg, val); + + return 0; +} + static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -1410,22 +2495,10 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return write_to_read_only(vcpu, p, r); csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); - p->regval = get_ccsidr(csselr); + csselr &= CSSELR_EL1_Level | CSSELR_EL1_InD; + if (csselr < CSSELR_MAX) + p->regval = get_ccsidr(vcpu, csselr); - /* - * Guests should not be doing cache operations by set/way at all, and - * for this reason, we trap them and attempt to infer the intent, so - * that we can flush the entire guest's address space at the appropriate - * time. - * To prevent this trapping from causing performance problems, let's - * expose the geometry of all data and unified caches (which are - * guaranteed to be PIPT and thus non-aliasing) as 1 set and 1 way. - * [If guests should attempt to infer aliasing properties from the - * geometry (which is not permitted by the architecture), they would - * only do so for virtually indexed caches.] - */ - if (!(csselr & 1)) // data or unified cache - p->regval &= ~GENMASK(27, 3); return true; } @@ -1446,22 +2519,119 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .visibility = mte_visibility, \ } -/* sys_reg_desc initialiser for known cpufeature ID registers */ -#define ID_SANITISED(name) { \ +static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (vcpu_has_nv(vcpu)) + return 0; + + return REG_HIDDEN; +} + +static bool bad_vncr_trap(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* + * We really shouldn't be here, and this is likely the result + * of a misconfigured trap, as this register should target the + * VNCR page, and nothing else. + */ + return bad_trap(vcpu, p, r, + "trap of VNCR-backed register"); +} + +static bool bad_redir_trap(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* + * We really shouldn't be here, and this is likely the result + * of a misconfigured trap, as this register should target the + * corresponding EL1, and nothing else. + */ + return bad_trap(vcpu, p, r, + "trap of EL2 register redirected to EL1"); +} + +#define SYS_REG_USER_FILTER(name, acc, rst, v, gu, su, filter) { \ SYS_DESC(SYS_##name), \ + .access = acc, \ + .reset = rst, \ + .reg = name, \ + .get_user = gu, \ + .set_user = su, \ + .visibility = filter, \ + .val = v, \ +} + +#define EL2_REG_FILTERED(name, acc, rst, v, filter) \ + SYS_REG_USER_FILTER(name, acc, rst, v, NULL, NULL, filter) + +#define EL2_REG(name, acc, rst, v) \ + EL2_REG_FILTERED(name, acc, rst, v, el2_visibility) + +#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) +#define EL2_REG_VNCR_FILT(name, vis) \ + EL2_REG_FILTERED(name, bad_vncr_trap, reset_val, 0, vis) +#define EL2_REG_VNCR_GICv3(name) \ + EL2_REG_VNCR_FILT(name, hidden_visibility) +#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) + +#define TIMER_REG(name, vis) \ + SYS_REG_USER_FILTER(name, access_arch_timer, reset_val, 0, \ + arch_timer_get_user, arch_timer_set_user, vis) + +/* + * Since reset() callback and field val are not used for idregs, they will be + * used for specific purposes for idregs. + * The reset() would return KVM sanitised register value. The value would be the + * same as the host kernel sanitised value if there is no KVM sanitisation. + * The val would be used as a mask indicating writable fields for the idreg. + * Only bits with 1 are writable from userspace. This mask might not be + * necessary in the future whenever all ID registers are enabled as writable + * from userspace. + */ + +#define ID_DESC_DEFAULT_CALLBACKS \ .access = access_id_reg, \ .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = id_visibility, \ -} + .reset = kvm_read_sanitised_id_reg -/* sys_reg_desc initialiser for known cpufeature ID registers */ -#define AA32_ID_SANITISED(name) { \ +#define ID_DESC(name) \ SYS_DESC(SYS_##name), \ - .access = access_id_reg, \ - .get_user = get_id_reg, \ - .set_user = set_id_reg, \ + ID_DESC_DEFAULT_CALLBACKS + +/* sys_reg_desc initialiser for known cpufeature ID registers */ +#define ID_SANITISED(name) { \ + ID_DESC(name), \ + .val = 0, \ +} + +/* sys_reg_desc initialiser for writable ID registers */ +#define ID_WRITABLE(name, mask) { \ + ID_DESC(name), \ + .val = mask, \ +} + +/* + * 32bit ID regs are fully writable when the guest is 32bit + * capable. Nothing in the KVM code should rely on 32bit features + * anyway, only 64bit, so let the VMM do its worse. + */ +#define AA32_ID_WRITABLE(name) { \ + ID_DESC(name), \ .visibility = aa32_id_visibility, \ + .val = GENMASK(31, 0), \ +} + +/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ +#define ID_FILTERED(sysreg, name, mask) { \ + ID_DESC(sysreg), \ + .set_user = set_##name, \ + .val = (mask), \ } /* @@ -1470,11 +2640,11 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, * (1 <= crm < 8, 0 <= Op2 < 8). */ #define ID_UNALLOCATED(crm, op2) { \ + .name = "S3_0_0_" #crm "_" #op2, \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ - .access = access_id_reg, \ - .get_user = get_id_reg, \ - .set_user = set_id_reg, \ - .visibility = raz_visibility \ + ID_DESC_DEFAULT_CALLBACKS, \ + .visibility = raz_visibility, \ + .val = 0, \ } /* @@ -1483,11 +2653,439 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, * RAZ for the guest. */ #define ID_HIDDEN(name) { \ - SYS_DESC(SYS_##name), \ - .access = access_id_reg, \ - .get_user = get_id_reg, \ - .set_user = set_id_reg, \ + ID_DESC(name), \ .visibility = raz_visibility, \ + .val = 0, \ +} + +static bool access_sp_el1(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_assign_sys_reg(vcpu, SP_EL1, p->regval); + else + p->regval = __vcpu_sys_reg(vcpu, SP_EL1); + + return true; +} + +static bool access_elr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, ELR_EL1); + else + p->regval = vcpu_read_sys_reg(vcpu, ELR_EL1); + + return true; +} + +static bool access_spsr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_assign_sys_reg(vcpu, SPSR_EL1, p->regval); + else + p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1); + + return true; +} + +static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_assign_sys_reg(vcpu, CNTKCTL_EL1, p->regval); + else + p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); + + return true; +} + +static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 val = r->val; + + if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) + val |= HCR_E2H; + + __vcpu_assign_sys_reg(vcpu, r->reg, val); + + return __vcpu_sys_reg(vcpu, r->reg); +} + +static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + unsigned int (*fn)(const struct kvm_vcpu *, + const struct sys_reg_desc *)) +{ + return el2_visibility(vcpu, rd) ?: fn(vcpu, rd); +} + +static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, sve_visibility); +} + +static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (el2_visibility(vcpu, rd) == 0 && + kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int sctlr2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_sctlr2(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int sctlr2_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, sctlr2_visibility); +} + +static bool access_zcr_el2(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + unsigned int vq; + + if (guest_hyp_sve_traps_enabled(vcpu)) { + kvm_inject_nested_sve_trap(vcpu); + return false; + } + + if (!p->is_write) { + p->regval = __vcpu_sys_reg(vcpu, ZCR_EL2); + return true; + } + + vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; + vq = min(vq, vcpu_sve_max_vq(vcpu)); + __vcpu_assign_sys_reg(vcpu, ZCR_EL2, vq - 1); + return true; +} + +static bool access_gic_vtr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = kvm_get_guest_vtr_el2(); + + return true; +} + +static bool access_gic_misr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_misr(vcpu); + + return true; +} + +static bool access_gic_eisr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_eisr(vcpu); + + return true; +} + +static bool access_gic_elrsr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = vgic_v3_get_elrsr(vcpu); + + return true; +} + +static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_s1poe(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1poe_visibility); +} + +static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_tcr2(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, tcr2_visibility); +} + +static unsigned int fgt2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (el2_visibility(vcpu, rd) == 0 && + kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, FGT2)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int fgt_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (el2_visibility(vcpu, rd) == 0 && + kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_s1pie(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1pie_visibility); +} + +static unsigned int cnthv_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (vcpu_has_nv(vcpu) && + !vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2_E2H0)) + return 0; + + return REG_HIDDEN; +} + +static bool access_mdcr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 hpmn, val, old = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!p->is_write) { + p->regval = old; + return true; + } + + val = p->regval; + hpmn = FIELD_GET(MDCR_EL2_HPMN, val); + + /* + * If HPMN is out of bounds, limit it to what we actually + * support. This matches the UNKNOWN definition of the field + * in that case, and keeps the emulation simple. Sort of. + */ + if (hpmn > vcpu->kvm->arch.nr_pmu_counters) { + hpmn = vcpu->kvm->arch.nr_pmu_counters; + u64p_replace_bits(&val, hpmn, MDCR_EL2_HPMN); + } + + __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); + + /* + * Request a reload of the PMU to enable/disable the counters + * affected by HPME. + */ + if ((old ^ val) & MDCR_EL2_HPME) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + return true; +} + +static bool access_ras(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + struct kvm *kvm = vcpu->kvm; + + switch(reg_to_encoding(r)) { + case SYS_ERXPFGCDN_EL1: + case SYS_ERXPFGCTL_EL1: + case SYS_ERXPFGF_EL1: + case SYS_ERXMISC2_EL1: + case SYS_ERXMISC3_EL1: + if (!(kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) || + (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) && + kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1)))) { + kvm_inject_undefined(vcpu); + return false; + } + break; + default: + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + } + + return trap_raz_wi(vcpu, p, r); +} + +/* + * For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and + * AIDR_EL1 as "invariant" registers, meaning userspace cannot change them. + * The values made visible to userspace were the register values of the boot + * CPU. + * + * At the same time, reads from these registers at EL1 previously were not + * trapped, allowing the guest to read the actual hardware value. On big-little + * machines, this means the VM can see different values depending on where a + * given vCPU got scheduled. + * + * These registers are now trapped as collateral damage from SME, and what + * follows attempts to give a user / guest view consistent with the existing + * ABI. + */ +static bool access_imp_id_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + /* + * Return the VM-scoped implementation ID register values if userspace + * has made them writable. + */ + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &vcpu->kvm->arch.flags)) + return access_id_reg(vcpu, p, r); + + /* + * Otherwise, fall back to the old behavior of returning the value of + * the current CPU. + */ + switch (reg_to_encoding(r)) { + case SYS_REVIDR_EL1: + p->regval = read_sysreg(revidr_el1); + break; + case SYS_AIDR_EL1: + p->regval = read_sysreg(aidr_el1); + break; + default: + WARN_ON_ONCE(1); + } + + return true; +} + +static u64 __ro_after_init boot_cpu_midr_val; +static u64 __ro_after_init boot_cpu_revidr_val; +static u64 __ro_after_init boot_cpu_aidr_val; + +static void init_imp_id_regs(void) +{ + boot_cpu_midr_val = read_sysreg(midr_el1); + boot_cpu_revidr_val = read_sysreg(revidr_el1); + boot_cpu_aidr_val = read_sysreg(aidr_el1); +} + +static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + switch (reg_to_encoding(r)) { + case SYS_MIDR_EL1: + return boot_cpu_midr_val; + case SYS_REVIDR_EL1: + return boot_cpu_revidr_val; + case SYS_AIDR_EL1: + return boot_cpu_aidr_val; + default: + KVM_BUG_ON(1, vcpu->kvm); + return 0; + } +} + +static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct kvm *kvm = vcpu->kvm; + u64 expected; + + guard(mutex)(&kvm->arch.config_lock); + + expected = read_id_reg(vcpu, r); + if (expected == val) + return 0; + + if (!test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags)) + return -EINVAL; + + /* + * Once the VM has started the ID registers are immutable. Reject the + * write if userspace tries to change it. + */ + if (kvm_vm_has_ran_once(kvm)) + return -EBUSY; + + /* + * Any value is allowed for the implementation ID registers so long as + * it is within the writable mask. + */ + if ((val & r->val) != val) + return -EINVAL; + + kvm_set_vm_id_reg(kvm, reg_to_encoding(r), val); + return 0; +} + +#define IMPLEMENTATION_ID(reg, mask) { \ + SYS_DESC(SYS_##reg), \ + .access = access_imp_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_imp_id_reg, \ + .reset = reset_imp_id_reg, \ + .val = mask, \ + } + +static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + __vcpu_assign_sys_reg(vcpu, r->reg, vcpu->kvm->arch.nr_pmu_counters); + return vcpu->kvm->arch.nr_pmu_counters; } /* @@ -1502,10 +3100,6 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, * guest... */ static const struct sys_reg_desc sys_reg_descs[] = { - { SYS_DESC(SYS_DC_ISW), access_dcsw }, - { SYS_DESC(SYS_DC_CSW), access_dcsw }, - { SYS_DESC(SYS_DC_CISW), access_dcsw }, - DBG_BCR_BVR_WCR_WVR_EL1(0), DBG_BCR_BVR_WCR_WVR_EL1(1), { SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 }, @@ -1528,7 +3122,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_MDRAR_EL1), trap_raz_wi }, { SYS_DESC(SYS_OSLAR_EL1), trap_oslar_el1 }, { SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1, reset_val, OSLSR_EL1, - SYS_OSLSR_OSLM_IMPLEMENTED, .set_user = set_oslsr_el1, }, + OSLSR_EL1_OSLM_IMPLEMENTED, .set_user = set_oslsr_el1, }, { SYS_DESC(SYS_OSDLR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGPRCR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGCLAIMSET_EL1), trap_raz_wi }, @@ -1540,9 +3134,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { // DBGDTR[TR]X_EL0 share the same encoding { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, - { SYS_DESC(SYS_DBGVCR32_EL2), NULL, reset_val, DBGVCR32_EL2, 0 }, + { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, + IMPLEMENTATION_ID(MIDR_EL1, GENMASK_ULL(31, 0)), { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, + IMPLEMENTATION_ID(REVIDR_EL1, GENMASK_ULL(63, 0)), /* * ID regs: all ID_SANITISED() entries here must have corresponding @@ -1551,52 +3147,89 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ - AA32_ID_SANITISED(ID_PFR0_EL1), - AA32_ID_SANITISED(ID_PFR1_EL1), - { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_dfr0_el1, - .visibility = aa32_id_visibility, }, + AA32_ID_WRITABLE(ID_PFR0_EL1), + AA32_ID_WRITABLE(ID_PFR1_EL1), + { SYS_DESC(SYS_ID_DFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_dfr0_el1, + .visibility = aa32_id_visibility, + .reset = read_sanitised_id_dfr0_el1, + .val = GENMASK(31, 0) }, ID_HIDDEN(ID_AFR0_EL1), - AA32_ID_SANITISED(ID_MMFR0_EL1), - AA32_ID_SANITISED(ID_MMFR1_EL1), - AA32_ID_SANITISED(ID_MMFR2_EL1), - AA32_ID_SANITISED(ID_MMFR3_EL1), + AA32_ID_WRITABLE(ID_MMFR0_EL1), + AA32_ID_WRITABLE(ID_MMFR1_EL1), + AA32_ID_WRITABLE(ID_MMFR2_EL1), + AA32_ID_WRITABLE(ID_MMFR3_EL1), /* CRm=2 */ - AA32_ID_SANITISED(ID_ISAR0_EL1), - AA32_ID_SANITISED(ID_ISAR1_EL1), - AA32_ID_SANITISED(ID_ISAR2_EL1), - AA32_ID_SANITISED(ID_ISAR3_EL1), - AA32_ID_SANITISED(ID_ISAR4_EL1), - AA32_ID_SANITISED(ID_ISAR5_EL1), - AA32_ID_SANITISED(ID_MMFR4_EL1), - AA32_ID_SANITISED(ID_ISAR6_EL1), + AA32_ID_WRITABLE(ID_ISAR0_EL1), + AA32_ID_WRITABLE(ID_ISAR1_EL1), + AA32_ID_WRITABLE(ID_ISAR2_EL1), + AA32_ID_WRITABLE(ID_ISAR3_EL1), + AA32_ID_WRITABLE(ID_ISAR4_EL1), + AA32_ID_WRITABLE(ID_ISAR5_EL1), + AA32_ID_WRITABLE(ID_MMFR4_EL1), + AA32_ID_WRITABLE(ID_ISAR6_EL1), /* CRm=3 */ - AA32_ID_SANITISED(MVFR0_EL1), - AA32_ID_SANITISED(MVFR1_EL1), - AA32_ID_SANITISED(MVFR2_EL1), + AA32_ID_WRITABLE(MVFR0_EL1), + AA32_ID_WRITABLE(MVFR1_EL1), + AA32_ID_WRITABLE(MVFR2_EL1), ID_UNALLOCATED(3,3), - AA32_ID_SANITISED(ID_PFR2_EL1), + AA32_ID_WRITABLE(ID_PFR2_EL1), ID_HIDDEN(ID_DFR1_EL1), - AA32_ID_SANITISED(ID_MMFR5_EL1), + AA32_ID_WRITABLE(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ /* CRm=4 */ - { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_aa64pfr0_el1, }, - ID_SANITISED(ID_AA64PFR1_EL1), - ID_UNALLOCATED(4,2), + ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1, + ~(ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SVE | + ID_AA64PFR0_EL1_AdvSIMD | + ID_AA64PFR0_EL1_FP)), + ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, + ~(ID_AA64PFR1_EL1_PFAR | + ID_AA64PFR1_EL1_MTEX | + ID_AA64PFR1_EL1_THE | + ID_AA64PFR1_EL1_GCS | + ID_AA64PFR1_EL1_MTE_frac | + ID_AA64PFR1_EL1_NMI | + ID_AA64PFR1_EL1_RNDR_trap | + ID_AA64PFR1_EL1_SME | + ID_AA64PFR1_EL1_RES0 | + ID_AA64PFR1_EL1_MPAM_frac | + ID_AA64PFR1_EL1_MTE)), + ID_WRITABLE(ID_AA64PFR2_EL1, + ID_AA64PFR2_EL1_FPMR | + ID_AA64PFR2_EL1_MTEFAR | + ID_AA64PFR2_EL1_MTESTOREONLY), ID_UNALLOCATED(4,3), - ID_SANITISED(ID_AA64ZFR0_EL1), + ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ID_HIDDEN(ID_AA64SMFR0_EL1), ID_UNALLOCATED(4,6), - ID_UNALLOCATED(4,7), + ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), /* CRm=5 */ - { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, }, + /* + * Prior to FEAT_Debugv8.9, the architecture defines context-aware + * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). + * KVM does not trap + emulate the breakpoint registers, and as such + * cannot support a layout that misaligns with the underlying hardware. + * While it may be possible to describe a subset that aligns with + * hardware, just prevent changes to BRPs and CTX_CMPs altogether for + * simplicity. + * + * See DDI0487K.a, section D2.8.3 Breakpoint types and linking + * of breakpoints for more details. + */ + ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1, + ID_AA64DFR0_EL1_DoubleLock_MASK | + ID_AA64DFR0_EL1_WRPs_MASK | + ID_AA64DFR0_EL1_PMUVer_MASK | + ID_AA64DFR0_EL1_DebugVer_MASK), ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), @@ -1606,21 +3239,42 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(5,7), /* CRm=6 */ - ID_SANITISED(ID_AA64ISAR0_EL1), - ID_SANITISED(ID_AA64ISAR1_EL1), - ID_SANITISED(ID_AA64ISAR2_EL1), - ID_UNALLOCATED(6,3), + ID_WRITABLE(ID_AA64ISAR0_EL1, ~ID_AA64ISAR0_EL1_RES0), + ID_WRITABLE(ID_AA64ISAR1_EL1, ~(ID_AA64ISAR1_EL1_GPI | + ID_AA64ISAR1_EL1_GPA | + ID_AA64ISAR1_EL1_API | + ID_AA64ISAR1_EL1_APA)), + ID_WRITABLE(ID_AA64ISAR2_EL1, ~(ID_AA64ISAR2_EL1_RES0 | + ID_AA64ISAR2_EL1_APA3 | + ID_AA64ISAR2_EL1_GPA3)), + ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT | + ID_AA64ISAR3_EL1_LSFE | + ID_AA64ISAR3_EL1_FAMINMAX)), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), ID_UNALLOCATED(6,6), ID_UNALLOCATED(6,7), /* CRm=7 */ - ID_SANITISED(ID_AA64MMFR0_EL1), - ID_SANITISED(ID_AA64MMFR1_EL1), - ID_SANITISED(ID_AA64MMFR2_EL1), - ID_UNALLOCATED(7,3), - ID_UNALLOCATED(7,4), + ID_FILTERED(ID_AA64MMFR0_EL1, id_aa64mmfr0_el1, + ~(ID_AA64MMFR0_EL1_RES0 | + ID_AA64MMFR0_EL1_ASIDBITS)), + ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | + ID_AA64MMFR1_EL1_XNX | + ID_AA64MMFR1_EL1_VH | + ID_AA64MMFR1_EL1_VMIDBits)), + ID_FILTERED(ID_AA64MMFR2_EL1, + id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 | + ID_AA64MMFR2_EL1_EVT | + ID_AA64MMFR2_EL1_FWB | + ID_AA64MMFR2_EL1_IDS | + ID_AA64MMFR2_EL1_NV | + ID_AA64MMFR2_EL1_CCIDX)), + ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_SCTLRX | + ID_AA64MMFR3_EL1_S1PIE | + ID_AA64MMFR3_EL1_S1POE)), + ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), ID_UNALLOCATED(7,7), @@ -1628,6 +3282,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, { SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 }, { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 }, + { SYS_DESC(SYS_SCTLR2_EL1), access_vm_reg, reset_val, SCTLR2_EL1, 0, + .visibility = sctlr2_visibility }, MTE_REG(RGSR_EL1), MTE_REG(GCR_EL1), @@ -1639,6 +3295,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, + { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0, + .visibility = tcr2_visibility }, PTRAUTH_KEY(APIA), PTRAUTH_KEY(APIB), @@ -1646,18 +3304,28 @@ static const struct sys_reg_desc sys_reg_descs[] = { PTRAUTH_KEY(APDB), PTRAUTH_KEY(APGA), + { SYS_DESC(SYS_SPSR_EL1), access_spsr}, + { SYS_DESC(SYS_ELR_EL1), access_elr}, + + { SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, - { SYS_DESC(SYS_ERRIDR_EL1), trap_raz_wi }, - { SYS_DESC(SYS_ERRSELR_EL1), trap_raz_wi }, - { SYS_DESC(SYS_ERXFR_EL1), trap_raz_wi }, - { SYS_DESC(SYS_ERXCTLR_EL1), trap_raz_wi }, - { SYS_DESC(SYS_ERXSTATUS_EL1), trap_raz_wi }, - { SYS_DESC(SYS_ERXADDR_EL1), trap_raz_wi }, - { SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi }, - { SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi }, + { SYS_DESC(SYS_ERRIDR_EL1), access_ras }, + { SYS_DESC(SYS_ERRSELR_EL1), access_ras }, + { SYS_DESC(SYS_ERXFR_EL1), access_ras }, + { SYS_DESC(SYS_ERXCTLR_EL1), access_ras }, + { SYS_DESC(SYS_ERXSTATUS_EL1), access_ras }, + { SYS_DESC(SYS_ERXADDR_EL1), access_ras }, + { SYS_DESC(SYS_ERXPFGF_EL1), access_ras }, + { SYS_DESC(SYS_ERXPFGCTL_EL1), access_ras }, + { SYS_DESC(SYS_ERXPFGCDN_EL1), access_ras }, + { SYS_DESC(SYS_ERXMISC0_EL1), access_ras }, + { SYS_DESC(SYS_ERXMISC1_EL1), access_ras }, + { SYS_DESC(SYS_ERXMISC2_EL1), access_ras }, + { SYS_DESC(SYS_ERXMISC3_EL1), access_ras }, MTE_REG(TFSR_EL1), MTE_REG(TFSRE0_EL1), @@ -1676,89 +3344,133 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access }, { SYS_DESC(SYS_PMBPTR_EL1), undef_access }, { SYS_DESC(SYS_PMBSR_EL1), undef_access }, + { SYS_DESC(SYS_PMSDSFR_EL1), undef_access }, /* PMBIDR_EL1 is not trapped */ - { PMU_SYS_REG(SYS_PMINTENSET_EL1), - .access = access_pminten, .reg = PMINTENSET_EL1 }, - { PMU_SYS_REG(SYS_PMINTENCLR_EL1), - .access = access_pminten, .reg = PMINTENSET_EL1 }, + { PMU_SYS_REG(PMINTENSET_EL1), + .access = access_pminten, .reg = PMINTENSET_EL1, + .get_user = get_pmreg, .set_user = set_pmreg }, + { PMU_SYS_REG(PMINTENCLR_EL1), + .access = access_pminten, .reg = PMINTENSET_EL1, + .get_user = get_pmreg, .set_user = set_pmreg }, { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, + { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1, + .visibility = s1pie_visibility }, + { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1, + .visibility = s1pie_visibility }, + { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, { SYS_DESC(SYS_LOREA_EL1), trap_loregion }, { SYS_DESC(SYS_LORN_EL1), trap_loregion }, { SYS_DESC(SYS_LORC_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAMIDR_EL1), undef_access }, { SYS_DESC(SYS_LORID_EL1), trap_loregion }, - { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 }, + { SYS_DESC(SYS_MPAM1_EL1), undef_access }, + { SYS_DESC(SYS_MPAM0_EL1), undef_access }, + { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, - { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, + { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, - { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, + { SYS_DESC(SYS_ACCDATA_EL1), undef_access }, + { SYS_DESC(SYS_SCXTNUM_EL1), undef_access }, { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, - { SYS_DESC(SYS_CLIDR_EL1), access_clidr }, + { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1, + .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 }, + { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, + IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)), { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, - { SYS_DESC(SYS_CTR_EL0), access_ctr }, - { SYS_DESC(SYS_SVCR), undef_access }, - - { PMU_SYS_REG(SYS_PMCR_EL0), .access = access_pmcr, - .reset = reset_pmcr, .reg = PMCR_EL0 }, - { PMU_SYS_REG(SYS_PMCNTENSET_EL0), - .access = access_pmcnten, .reg = PMCNTENSET_EL0 }, - { PMU_SYS_REG(SYS_PMCNTENCLR_EL0), - .access = access_pmcnten, .reg = PMCNTENSET_EL0 }, - { PMU_SYS_REG(SYS_PMOVSCLR_EL0), - .access = access_pmovs, .reg = PMOVSSET_EL0 }, + ID_FILTERED(CTR_EL0, ctr_el0, + CTR_EL0_DIC_MASK | + CTR_EL0_IDC_MASK | + CTR_EL0_DminLine_MASK | + CTR_EL0_L1Ip_MASK | + CTR_EL0_IminLine_MASK), + { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, + { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, + + { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, + .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, + { PMU_SYS_REG(PMCNTENSET_EL0), + .access = access_pmcnten, .reg = PMCNTENSET_EL0, + .get_user = get_pmreg, .set_user = set_pmreg }, + { PMU_SYS_REG(PMCNTENCLR_EL0), + .access = access_pmcnten, .reg = PMCNTENSET_EL0, + .get_user = get_pmreg, .set_user = set_pmreg }, + { PMU_SYS_REG(PMOVSCLR_EL0), + .access = access_pmovs, .reg = PMOVSSET_EL0, + .get_user = get_pmreg, .set_user = set_pmreg }, /* * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was * previously (and pointlessly) advertised in the past... */ - { PMU_SYS_REG(SYS_PMSWINC_EL0), + { PMU_SYS_REG(PMSWINC_EL0), .get_user = get_raz_reg, .set_user = set_wi_reg, .access = access_pmswinc, .reset = NULL }, - { PMU_SYS_REG(SYS_PMSELR_EL0), + { PMU_SYS_REG(PMSELR_EL0), .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, - { PMU_SYS_REG(SYS_PMCEID0_EL0), + { PMU_SYS_REG(PMCEID0_EL0), .access = access_pmceid, .reset = NULL }, - { PMU_SYS_REG(SYS_PMCEID1_EL0), + { PMU_SYS_REG(PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, - { PMU_SYS_REG(SYS_PMCCNTR_EL0), - .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, - { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), + { PMU_SYS_REG(PMCCNTR_EL0), + .access = access_pmu_evcntr, .reset = reset_unknown, + .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr, + .set_user = set_pmu_evcntr }, + { PMU_SYS_REG(PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, - { PMU_SYS_REG(SYS_PMXEVCNTR_EL0), + { PMU_SYS_REG(PMXEVCNTR_EL0), .access = access_pmu_evcntr, .reset = NULL }, /* * PMUSERENR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. */ - { PMU_SYS_REG(SYS_PMUSERENR_EL0), .access = access_pmuserenr, + { PMU_SYS_REG(PMUSERENR_EL0), .access = access_pmuserenr, .reset = reset_val, .reg = PMUSERENR_EL0, .val = 0 }, - { PMU_SYS_REG(SYS_PMOVSSET_EL0), - .access = access_pmovs, .reg = PMOVSSET_EL0 }, + { PMU_SYS_REG(PMOVSSET_EL0), + .access = access_pmovs, .reg = PMOVSSET_EL0, + .get_user = get_pmreg, .set_user = set_pmreg }, + { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, @@ -1838,9 +3550,19 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVTYPER1_EL0(14), AMU_AMEVTYPER1_EL0(15), + { SYS_DESC(SYS_CNTPCT_EL0), .access = access_arch_timer, + .get_user = arch_timer_get_user, .set_user = arch_timer_set_user }, + { SYS_DESC(SYS_CNTVCT_EL0), .access = access_arch_timer, + .get_user = arch_timer_get_user, .set_user = arch_timer_set_user }, + { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, - { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, - { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, + TIMER_REG(CNTP_CTL_EL0, NULL), + TIMER_REG(CNTP_CVAL_EL0, NULL), + + { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, + TIMER_REG(CNTV_CTL_EL0, NULL), + TIMER_REG(CNTV_CVAL_EL0, NULL), /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), @@ -1910,12 +3632,650 @@ static const struct sys_reg_desc sys_reg_descs[] = { * PMCCFILTR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. */ - { PMU_SYS_REG(SYS_PMCCFILTR_EL0), .access = access_pmu_evtyper, + { PMU_SYS_REG(PMCCFILTR_EL0), .access = access_pmu_evtyper, .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 }, - { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, - { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, - { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, + EL2_REG_VNCR(VPIDR_EL2, reset_unknown, 0), + EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0), + EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), + EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), + EL2_REG_FILTERED(SCTLR2_EL2, access_vm_reg, reset_val, 0, + sctlr2_el2_visibility), + EL2_REG_VNCR(HCR_EL2, reset_hcr, 0), + EL2_REG(MDCR_EL2, access_mdcr, reset_mdcr, 0), + EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), + EL2_REG_VNCR(HSTR_EL2, reset_val, 0), + EL2_REG_VNCR_FILT(HFGRTR_EL2, fgt_visibility), + EL2_REG_VNCR_FILT(HFGWTR_EL2, fgt_visibility), + EL2_REG_VNCR(HFGITR_EL2, reset_val, 0), + EL2_REG_VNCR(HACR_EL2, reset_val, 0), + + EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0, + sve_el2_visibility), + + EL2_REG_VNCR(HCRX_EL2, reset_val, 0), + + EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), + EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), + EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), + EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1, + tcr2_el2_visibility), + EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), + EL2_REG_VNCR(VTCR_EL2, reset_val, 0), + EL2_REG_FILTERED(VNCR_EL2, bad_vncr_trap, reset_val, 0, + vncr_el2_visibility), + + { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, + EL2_REG_VNCR_FILT(HDFGRTR2_EL2, fgt2_visibility), + EL2_REG_VNCR_FILT(HDFGWTR2_EL2, fgt2_visibility), + EL2_REG_VNCR_FILT(HFGRTR2_EL2, fgt2_visibility), + EL2_REG_VNCR_FILT(HFGWTR2_EL2, fgt2_visibility), + EL2_REG_VNCR_FILT(HDFGRTR_EL2, fgt_visibility), + EL2_REG_VNCR_FILT(HDFGWTR_EL2, fgt_visibility), + EL2_REG_VNCR_FILT(HAFGRTR_EL2, fgt_visibility), + EL2_REG_VNCR_FILT(HFGITR2_EL2, fgt2_visibility), + EL2_REG_REDIR(SPSR_EL2, reset_val, 0), + EL2_REG_REDIR(ELR_EL2, reset_val, 0), + { SYS_DESC(SYS_SP_EL1), access_sp_el1}, + + /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ + { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi }, + + { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 }, + EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), + EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), + EL2_REG_REDIR(ESR_EL2, reset_val, 0), + EL2_REG_VNCR(VSESR_EL2, reset_unknown, 0), + { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 }, + + EL2_REG_REDIR(FAR_EL2, reset_val, 0), + EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), + + EL2_REG(MAIR_EL2, access_rw, reset_val, 0), + EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0, + s1pie_el2_visibility), + EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0, + s1pie_el2_visibility), + EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0, + s1poe_el2_visibility), + EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_MPAMHCR_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access }, + { SYS_DESC(SYS_MPAM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access }, + + EL2_REG(VBAR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_RVBAR_EL2), undef_access }, + { SYS_DESC(SYS_RMR_EL2), undef_access }, + EL2_REG_VNCR(VDISR_EL2, reset_unknown, 0), + + EL2_REG_VNCR_GICv3(ICH_AP0R0_EL2), + EL2_REG_VNCR_GICv3(ICH_AP0R1_EL2), + EL2_REG_VNCR_GICv3(ICH_AP0R2_EL2), + EL2_REG_VNCR_GICv3(ICH_AP0R3_EL2), + EL2_REG_VNCR_GICv3(ICH_AP1R0_EL2), + EL2_REG_VNCR_GICv3(ICH_AP1R1_EL2), + EL2_REG_VNCR_GICv3(ICH_AP1R2_EL2), + EL2_REG_VNCR_GICv3(ICH_AP1R3_EL2), + + { SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre }, + + EL2_REG_VNCR_GICv3(ICH_HCR_EL2), + { SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr }, + { SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr }, + { SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr }, + { SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr }, + EL2_REG_VNCR_GICv3(ICH_VMCR_EL2), + + EL2_REG_VNCR_GICv3(ICH_LR0_EL2), + EL2_REG_VNCR_GICv3(ICH_LR1_EL2), + EL2_REG_VNCR_GICv3(ICH_LR2_EL2), + EL2_REG_VNCR_GICv3(ICH_LR3_EL2), + EL2_REG_VNCR_GICv3(ICH_LR4_EL2), + EL2_REG_VNCR_GICv3(ICH_LR5_EL2), + EL2_REG_VNCR_GICv3(ICH_LR6_EL2), + EL2_REG_VNCR_GICv3(ICH_LR7_EL2), + EL2_REG_VNCR_GICv3(ICH_LR8_EL2), + EL2_REG_VNCR_GICv3(ICH_LR9_EL2), + EL2_REG_VNCR_GICv3(ICH_LR10_EL2), + EL2_REG_VNCR_GICv3(ICH_LR11_EL2), + EL2_REG_VNCR_GICv3(ICH_LR12_EL2), + EL2_REG_VNCR_GICv3(ICH_LR13_EL2), + EL2_REG_VNCR_GICv3(ICH_LR14_EL2), + EL2_REG_VNCR_GICv3(ICH_LR15_EL2), + + EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), + EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), + + EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), + EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, + TIMER_REG(CNTHP_CTL_EL2, el2_visibility), + TIMER_REG(CNTHP_CVAL_EL2, el2_visibility), + + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer, .visibility = cnthv_visibility }, + TIMER_REG(CNTHV_CTL_EL2, cnthv_visibility), + TIMER_REG(CNTHV_CVAL_EL2, cnthv_visibility), + + { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, + + { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer }, + + { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer }, + + EL2_REG(SP_EL2, NULL, reset_unknown, 0), +}; + +static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (__kvm_at_s1e01(vcpu, op, p->regval)) + return false; + + return true; +} + +static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + /* There is no FGT associated with AT S1E2A :-( */ + if (op == OP_AT_S1E2A && + !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + if (__kvm_at_s1e2(vcpu, op, p->regval)) + return false; + + return true; +} + +static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (__kvm_at_s12(vcpu, op, p->regval)) + return false; + + return true; +} + +static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + + if (sys_reg_CRn(instr) == TLBI_CRn_nXS && + !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) + return false; + + if (CRm == TLBI_CRm_nROS && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + return true; +} + +static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + write_lock(&vcpu->kvm->mmu_lock); + + /* + * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the + * corresponding VMIDs. + */ + kvm_nested_s2_unmap(vcpu->kvm, true); + + write_unlock(&vcpu->kvm->mmu_lock); + + return true; +} + +static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + u8 Op2 = sys_reg_Op2(instr); + + if (sys_reg_CRn(instr) == TLBI_CRn_nXS && + !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) + return false; + + if (CRm == TLBI_CRm_IPAIS && (Op2 == 2 || Op2 == 6) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + if (CRm == TLBI_CRm_IPAONS && (Op2 == 0 || Op2 == 4) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + if (CRm == TLBI_CRm_IPAONS && (Op2 == 3 || Op2 == 7) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + return true; +} + +/* Only defined here as this is an internal "abstraction" */ +union tlbi_info { + struct { + u64 start; + u64 size; + } range; + + struct { + u64 addr; + } ipa; + + struct { + u64 addr; + u32 encoding; + } va; +}; + +static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + /* + * The unmap operation is allowed to drop the MMU lock and block, which + * means that @mmu could be used for a different context than the one + * currently being invalidated. + * + * This behavior is still safe, as: + * + * 1) The vCPU(s) that recycled the MMU are responsible for invalidating + * the entire MMU before reusing it, which still honors the intent + * of a TLBI. + * + * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC + * and ERET to the guest), other vCPUs are allowed to use stale + * translations. + * + * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and + * at worst may cause more aborts for shadow stage-2 fills. + * + * Dropping the MMU lock also implies that shadow stage-2 fills could + * happen behind the back of the TLBI. This is still safe, though, as + * the L1 needs to put its stage-2 in a consistent state before doing + * the TLBI. + */ + kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); +} + +static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 limit, vttbr; + + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm)); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .range = { + .start = 0, + .size = limit, + }, + }, + s2_mmu_unmap_range); + + return true; +} + +static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + u64 base, range; + + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + /* + * Because the shadow S2 structure doesn't necessarily reflect that + * of the guest's S2 (different base granule size, for example), we + * decide to ignore TTL and only use the described range. + */ + base = decode_range_tlbi(p->regval, &range, NULL); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .range = { + .start = base, + .size = range, + }, + }, + s2_mmu_unmap_range); + + return true; +} + +static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + unsigned long max_size; + u64 base_addr; + + /* + * We drop a number of things from the supplied value: + * + * - NS bit: we're non-secure only. + * + * - IPA[51:48]: We don't support 52bit IPA just yet... + * + * And of course, adjust the IPA to be on an actual address. + */ + base_addr = (info->ipa.addr & GENMASK_ULL(35, 0)) << 12; + max_size = compute_tlb_inval_range(mmu, info->ipa.addr); + base_addr &= ~(max_size - 1); + + /* + * See comment in s2_mmu_unmap_range() for why this is allowed to + * reschedule. + */ + kvm_stage2_unmap_range(mmu, base_addr, max_size, true); +} + +static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .ipa = { + .addr = p->regval, + }, + }, + s2_mmu_unmap_ipa); + + return true; +} + +static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding)); +} + +static bool handle_tlbi_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (!kvm_supported_tlbi_s1e2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); + return true; +} + +static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + /* + * If we're here, this is because we've trapped on a EL1 TLBI + * instruction that affects the EL1 translation regime while + * we're running in a context that doesn't allow us to let the + * HW do its thing (aka vEL2): + * + * - HCR_EL2.E2H == 0 : a non-VHE guest + * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode + * + * Another possibility is that we are invalidating the EL2 context + * using EL1 instructions, but that we landed here because we need + * additional invalidation for structures that are not held in the + * CPU TLBs (such as the VNCR pseudo-TLB and its EL2 mapping). In + * that case, we are guaranteed that HCR_EL2.{E2H,TGE} == { 1, 1 } + * as we don't allow an NV-capable L1 in a nVHE configuration. + * + * We don't expect these helpers to ever be called when running + * in a vEL1 context. + */ + + WARN_ON(!vcpu_is_el2(vcpu)); + + if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) { + kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); + return true; + } + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, + get_vmid(__vcpu_sys_reg(vcpu, VTTBR_EL2)), + &(union tlbi_info) { + .va = { + .addr = p->regval, + .encoding = sys_encoding, + }, + }, + s2_mmu_tlbi_s1e1); + + return true; +} + +#define SYS_INSN(insn, access_fn) \ + { \ + SYS_DESC(OP_##insn), \ + .access = (access_fn), \ + } + +static struct sys_reg_desc sys_insn_descs[] = { + { SYS_DESC(SYS_DC_ISW), access_dcsw }, + { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, + { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, + + SYS_INSN(AT_S1E1R, handle_at_s1e01), + SYS_INSN(AT_S1E1W, handle_at_s1e01), + SYS_INSN(AT_S1E0R, handle_at_s1e01), + SYS_INSN(AT_S1E0W, handle_at_s1e01), + SYS_INSN(AT_S1E1RP, handle_at_s1e01), + SYS_INSN(AT_S1E1WP, handle_at_s1e01), + + { SYS_DESC(SYS_DC_CSW), access_dcsw }, + { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CISW), access_dcsw }, + { SYS_DESC(SYS_DC_CIGSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw }, + + SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1), + + SYS_INSN(AT_S1E2R, handle_at_s1e2), + SYS_INSN(AT_S1E2W, handle_at_s1e2), + SYS_INSN(AT_S12E1R, handle_at_s12), + SYS_INSN(AT_S12E1W, handle_at_s12), + SYS_INSN(AT_S12E0R, handle_at_s12), + SYS_INSN(AT_S12E0W, handle_at_s12), + SYS_INSN(AT_S1E2A, handle_at_s1e2), + + SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), + + SYS_INSN(TLBI_ALLE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE1OS, handle_alle1is), + SYS_INSN(TLBI_VALE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), + + SYS_INSN(TLBI_RVAE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2IS, handle_tlbi_el2), + + SYS_INSN(TLBI_ALLE1IS, handle_alle1is), + + SYS_INSN(TLBI_VALE2IS, handle_tlbi_el2), + + SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), + SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), + SYS_INSN(TLBI_RVAE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_RVAE2, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2, handle_tlbi_el2), + + SYS_INSN(TLBI_ALLE1, handle_alle1is), + + SYS_INSN(TLBI_VALE2, handle_tlbi_el2), + + SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), + + SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), + + SYS_INSN(TLBI_ALLE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), + SYS_INSN(TLBI_VALE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), + + SYS_INSN(TLBI_RVAE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2ISNXS, handle_tlbi_el2), + + SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), + SYS_INSN(TLBI_VALE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), + SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1NXS, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), + SYS_INSN(TLBI_RVAE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVAE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), + SYS_INSN(TLBI_VALE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), }; static bool trap_dbgdidr(struct kvm_vcpu *vcpu, @@ -1925,14 +4285,14 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, if (p->is_write) { return ignore_write(vcpu, p); } else { - u64 dfr = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - u64 pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - u32 el3 = !!cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR0_EL1_EL3_SHIFT); - - p->regval = ((((dfr >> ID_AA64DFR0_EL1_WRPs_SHIFT) & 0xf) << 28) | - (((dfr >> ID_AA64DFR0_EL1_BRPs_SHIFT) & 0xf) << 24) | - (((dfr >> ID_AA64DFR0_EL1_CTX_CMPs_SHIFT) & 0xf) << 20) - | (6 << 16) | (1 << 15) | (el3 << 14) | (el3 << 12)); + u64 dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1); + u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP); + + p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) | + (SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr) << 24) | + (SYS_FIELD_GET(ID_AA64DFR0_EL1, CTX_CMPs, dfr) << 20) | + (SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, dfr) << 16) | + (1 << 15) | (el3 << 14) | (el3 << 12)); return true; } } @@ -1946,18 +4306,20 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, * None of the other registers share their location, so treat them as * if they were 64bit. */ -#define DBG_BCR_BVR_WCR_WVR(n) \ - /* DBGBVRn */ \ - { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \ - /* DBGBCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \ - /* DBGWVRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \ - /* DBGWCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n } - -#define DBGBXVR(n) \ - { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_bvr, NULL, n } +#define DBG_BCR_BVR_WCR_WVR(n) \ + /* DBGBVRn */ \ + { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), \ + trap_dbg_wb_reg, NULL, n }, \ + /* DBGBCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWVRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n } + +#define DBGBXVR(n) \ + { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), \ + trap_dbg_wb_reg, NULL, n } /* * Trapped cp14 registers. We generally ignore most of the external @@ -2092,6 +4454,7 @@ static const struct sys_reg_desc cp15_regs[] = { /* TTBCR2 */ { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, + { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, /* DFSR */ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, @@ -2141,8 +4504,28 @@ static const struct sys_reg_desc cp15_regs[] = { /* AMAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, - /* ICC_SRE */ - { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, + { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, @@ -2219,6 +4602,10 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, + + /* CCSIDR2 */ + { Op1(1), CRn( 0), CRm( 0), Op2(2), undef_access }, + { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 }, }; @@ -2226,25 +4613,31 @@ static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, { CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr }, { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */ + { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer }, { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ + { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer }, { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer }, }; static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, - bool is_32) + bool reset_check) { unsigned int i; for (i = 0; i < n; i++) { - if (!is_32 && table[i].reg && !table[i].reset) { - kvm_err("sys_reg table %pS entry %d lacks reset\n", &table[i], i); + if (reset_check && table[i].reg && !table[i].reset) { + kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n", + &table[i], i, table[i].name); return false; } if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) { - kvm_err("sys_reg table %pS entry %d out of order\n", &table[i - 1], i - 1); + kvm_err("sys_reg table %pS entry %d (%s -> %s) out of order\n", + &table[i], i, table[i - 1].name, table[i].name); return false; } } @@ -2341,7 +4734,8 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, /** * kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer - * @run: The kvm_run struct + * @global: &struct sys_reg_desc + * @nr_global: size of the @global array */ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, const struct sys_reg_desc *global, @@ -2430,7 +4824,7 @@ static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params) return true; kvm_pr_unimpl("Unhandled cp10 register %s: %u\n", - params->is_write ? "write" : "read", reg_id); + str_write_read(params->is_write), reg_id); return false; } @@ -2508,7 +4902,9 @@ static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu, /** * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer - * @run: The kvm_run struct + * @params: &struct sys_reg_params + * @global: &struct sys_reg_desc + * @nr_global: size of the @global array */ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, struct sys_reg_params *params, @@ -2544,9 +4940,13 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) * Certain AArch32 ID registers are handled by rerouting to the AArch64 * system register table. Registers in the ID range where CRm=0 are * excluded from this scheme as they do not trivially map into AArch64 - * system register encodings. + * system register encodings, except for AIDR/REVIDR. */ - if (params.Op1 == 0 && params.CRn == 0 && params.CRm) + if (params.Op1 == 0 && params.CRn == 0 && + (params.CRm || params.Op2 == 6 /* REVIDR */)) + return kvm_emulate_cp15_id_reg(vcpu, ¶ms); + if (params.Op1 == 1 && params.CRn == 0 && + params.CRm == 0 && params.Op2 == 7 /* AIDR */) return kvm_emulate_cp15_id_reg(vcpu, ¶ms); return kvm_handle_cp_32(vcpu, ¶ms, cp15_regs, ARRAY_SIZE(cp15_regs)); @@ -2566,12 +4966,6 @@ int kvm_handle_cp14_32(struct kvm_vcpu *vcpu) return kvm_handle_cp_32(vcpu, ¶ms, cp14_regs, ARRAY_SIZE(cp14_regs)); } -static bool is_imp_def_sys_reg(struct sys_reg_params *params) -{ - // See ARM DDI 0487E.a, section D12.3.2 - return params->Op0 == 3 && (params->CRn & 0b1011) == 0b1011; -} - /** * emulate_sys_reg - Emulate a guest access to an AArch64 system register * @vcpu: The VCPU pointer @@ -2580,26 +4974,145 @@ static bool is_imp_def_sys_reg(struct sys_reg_params *params) * Return: true if the system register access was successful, false otherwise. */ static bool emulate_sys_reg(struct kvm_vcpu *vcpu, - struct sys_reg_params *params) + struct sys_reg_params *params) { const struct sys_reg_desc *r; r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); - if (likely(r)) { perform_access(vcpu, params, r); return true; } - if (is_imp_def_sys_reg(params)) { - kvm_inject_undefined(vcpu); + print_sys_reg_msg(params, + "Unsupported guest sys_reg access at: %lx [%08lx]\n", + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); + kvm_inject_undefined(vcpu); + + return false; +} + +static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos) +{ + unsigned long i, idreg_idx = 0; + + for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *r = &sys_reg_descs[i]; + + if (!is_vm_ftr_id_reg(reg_to_encoding(r))) + continue; + + if (idreg_idx == pos) + return r; + + idreg_idx++; + } + + return NULL; +} + +static void *idregs_debug_start(struct seq_file *s, loff_t *pos) +{ + struct kvm *kvm = s->private; + u8 *iter; + + mutex_lock(&kvm->arch.config_lock); + + iter = &kvm->arch.idreg_debugfs_iter; + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) && + *iter == (u8)~0) { + *iter = *pos; + if (!idregs_debug_find(kvm, *iter)) + iter = NULL; } else { - print_sys_reg_msg(params, - "Unsupported guest sys_reg access at: %lx [%08lx]\n", - *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); - kvm_inject_undefined(vcpu); + iter = ERR_PTR(-EBUSY); } - return false; + + mutex_unlock(&kvm->arch.config_lock); + + return iter; +} + +static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kvm *kvm = s->private; + + (*pos)++; + + if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) { + kvm->arch.idreg_debugfs_iter++; + + return &kvm->arch.idreg_debugfs_iter; + } + + return NULL; +} + +static void idregs_debug_stop(struct seq_file *s, void *v) +{ + struct kvm *kvm = s->private; + + if (IS_ERR(v)) + return; + + mutex_lock(&kvm->arch.config_lock); + + kvm->arch.idreg_debugfs_iter = ~0; + + mutex_unlock(&kvm->arch.config_lock); +} + +static int idregs_debug_show(struct seq_file *s, void *v) +{ + const struct sys_reg_desc *desc; + struct kvm *kvm = s->private; + + desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter); + + if (!desc->name) + return 0; + + seq_printf(s, "%20s:\t%016llx\n", + desc->name, kvm_read_vm_id_reg(kvm, reg_to_encoding(desc))); + + return 0; +} + +static const struct seq_operations idregs_debug_sops = { + .start = idregs_debug_start, + .next = idregs_debug_next, + .stop = idregs_debug_stop, + .show = idregs_debug_show, +}; + +DEFINE_SEQ_ATTRIBUTE(idregs_debug); + +void kvm_sys_regs_create_debugfs(struct kvm *kvm) +{ + kvm->arch.idreg_debugfs_iter = ~0; + + debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm, + &idregs_debug_fops); +} + +static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg) +{ + u32 id = reg_to_encoding(reg); + struct kvm *kvm = vcpu->kvm; + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + kvm_set_vm_id_reg(kvm, id, reg->reset(vcpu, reg)); +} + +static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *reg) +{ + if (kvm_vcpu_initialized(vcpu)) + return; + + reg->reset(vcpu, reg); } /** @@ -2611,33 +5124,66 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu, */ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; unsigned long i; - for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) - if (sys_reg_descs[i].reset) - sys_reg_descs[i].reset(vcpu, &sys_reg_descs[i]); + for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *r = &sys_reg_descs[i]; + + if (!r->reset) + continue; + + if (is_vm_ftr_id_reg(reg_to_encoding(r))) + reset_vm_ftr_id_reg(vcpu, r); + else if (is_vcpu_ftr_id_reg(reg_to_encoding(r))) + reset_vcpu_ftr_id_reg(vcpu, r); + else + r->reset(vcpu, r); + + if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) + __vcpu_rmw_sys_reg(vcpu, r->reg, |=, 0); + } + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); + + if (kvm_vcpu_has_pmu(vcpu)) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); } /** - * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access + * kvm_handle_sys_reg -- handles a system instruction or mrs/msr instruction + * trap on a guest execution * @vcpu: The VCPU pointer */ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) { + const struct sys_reg_desc *desc = NULL; struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); + int sr_idx; trace_kvm_handle_sys_reg(esr); + if (triage_sysreg_trap(vcpu, &sr_idx)) + return 1; + params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); - if (!emulate_sys_reg(vcpu, ¶ms)) - return 1; + /* System registers have Op0=={2,3}, as per DDI487 J.a C5.1.2 */ + if (params.Op0 == 2 || params.Op0 == 3) + desc = &sys_reg_descs[sr_idx]; + else + desc = &sys_insn_descs[sr_idx]; + + perform_access(vcpu, ¶ms, desc); - if (!params.is_write) + /* Read from system register? */ + if (!params.is_write && + (params.Op0 == 2 || params.Op0 == 3)) vcpu_set_reg(vcpu, Rt, params.regval); + return 1; } @@ -2707,99 +5253,7 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, return r; } -/* - * These are the invariant sys_reg registers: we let the guest see the - * host versions of these, so they're part of the guest state. - * - * A future CPU may provide a mechanism to present different values to - * the guest, or a future kvm may trap them. - */ - -#define FUNCTION_INVARIANT(reg) \ - static void get_##reg(struct kvm_vcpu *v, \ - const struct sys_reg_desc *r) \ - { \ - ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \ - } - -FUNCTION_INVARIANT(midr_el1) -FUNCTION_INVARIANT(revidr_el1) -FUNCTION_INVARIANT(clidr_el1) -FUNCTION_INVARIANT(aidr_el1) - -static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) -{ - ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); -} - -/* ->val is filled in by kvm_sys_reg_table_init() */ -static struct sys_reg_desc invariant_sys_regs[] = { - { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, - { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 }, - { SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 }, - { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 }, - { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 }, -}; - -static int get_invariant_sys_reg(u64 id, u64 __user *uaddr) -{ - const struct sys_reg_desc *r; - - r = get_reg_by_id(id, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); - if (!r) - return -ENOENT; - - return put_user(r->val, uaddr); -} - -static int set_invariant_sys_reg(u64 id, u64 __user *uaddr) -{ - const struct sys_reg_desc *r; - u64 val; - - r = get_reg_by_id(id, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); - if (!r) - return -ENOENT; - - if (get_user(val, uaddr)) - return -EFAULT; - - /* This is what we mean by invariant: you can't change it. */ - if (r->val != val) - return -EINVAL; - - return 0; -} - -static bool is_valid_cache(u32 val) -{ - u32 level, ctype; - - if (val >= CSSELR_MAX) - return false; - - /* Bottom bit is Instruction or Data bit. Next 3 bits are level. */ - level = (val >> 1); - ctype = (cache_levels >> (level * 3)) & 7; - - switch (ctype) { - case 0: /* No cache */ - return false; - case 1: /* Instruction cache only */ - return (val & 1); - case 2: /* Data cache only */ - case 4: /* Unified cache */ - return !(val & 1); - case 3: /* Separate instruction and data caches */ - return true; - default: /* Reserved: we can't know instruction or data. */ - return false; - } -} - -static int demux_c15_get(u64 id, void __user *uaddr) +static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val; u32 __user *uval = uaddr; @@ -2815,16 +5269,16 @@ static int demux_c15_get(u64 id, void __user *uaddr) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; - if (!is_valid_cache(val)) + if (val >= CSSELR_MAX) return -ENOENT; - return put_user(get_ccsidr(val), uval); + return put_user(get_ccsidr(vcpu, val), uval); default: return -ENOENT; } } -static int demux_c15_set(u64 id, void __user *uaddr) +static int demux_c15_set(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val, newval; u32 __user *uval = uaddr; @@ -2840,31 +5294,41 @@ static int demux_c15_set(u64 id, void __user *uaddr) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; - if (!is_valid_cache(val)) + if (val >= CSSELR_MAX) return -ENOENT; if (get_user(newval, uval)) return -EFAULT; - /* This is also invariant: you can't change it. */ - if (newval != get_ccsidr(val)) - return -EINVAL; - return 0; + return set_ccsidr(vcpu, val, newval); default: return -ENOENT; } } +static u64 kvm_one_reg_to_id(const struct kvm_one_reg *reg) +{ + switch(reg->id) { + case KVM_REG_ARM_TIMER_CVAL: + return TO_ARM64_SYS_REG(CNTV_CVAL_EL0); + case KVM_REG_ARM_TIMER_CNT: + return TO_ARM64_SYS_REG(CNTVCT_EL0); + default: + return reg->id; + } +} + int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num) { u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; + u64 id = kvm_one_reg_to_id(reg); u64 val; int ret; - r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r) + r = id_to_sys_reg_desc(vcpu, id, table, num); + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) { @@ -2883,14 +5347,9 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; - int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) - return demux_c15_get(reg->id, uaddr); - - err = get_invariant_sys_reg(reg->id, uaddr); - if (err != -ENOENT) - return err; + return demux_c15_get(vcpu, reg->id, uaddr); return kvm_sys_reg_get_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); @@ -2901,14 +5360,15 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, { u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; + u64 id = kvm_one_reg_to_id(reg); u64 val; int ret; if (get_user(val, uaddr)) return -EFAULT; - r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r) + r = id_to_sys_reg_desc(vcpu, id, table, num); + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (sysreg_user_write_ignore(vcpu, r)) @@ -2917,7 +5377,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, if (r->set_user) { ret = (r->set_user)(vcpu, r, val); } else { - __vcpu_sys_reg(vcpu, r->reg) = val; + __vcpu_assign_sys_reg(vcpu, r->reg, val); ret = 0; } @@ -2927,14 +5387,9 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; - int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) - return demux_c15_set(reg->id, uaddr); - - err = set_invariant_sys_reg(reg->id, uaddr); - if (err != -ENOENT) - return err; + return demux_c15_set(vcpu, reg->id, uaddr); return kvm_sys_reg_set_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); @@ -2942,13 +5397,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg static unsigned int num_demux_regs(void) { - unsigned int i, count = 0; - - for (i = 0; i < CSSELR_MAX; i++) - if (is_valid_cache(i)) - count++; - - return count; + return CSSELR_MAX; } static int write_demux_regids(u64 __user *uindices) @@ -2958,8 +5407,6 @@ static int write_demux_regids(u64 __user *uindices) val |= KVM_REG_ARM_DEMUX_ID_CCSIDR; for (i = 0; i < CSSELR_MAX; i++) { - if (!is_valid_cache(i)) - continue; if (put_user(val | i, uindices)) return -EFAULT; uindices++; @@ -2980,10 +5427,23 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg) static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) { + u64 idx; + if (!*uind) return true; - if (put_user(sys_reg_to_index(reg), *uind)) + switch (reg_to_encoding(reg)) { + case SYS_CNTV_CVAL_EL0: + idx = KVM_REG_ARM_TIMER_CVAL; + break; + case SYS_CNTVCT_EL0: + idx = KVM_REG_ARM_TIMER_CNT; + break; + default: + idx = sys_reg_to_index(reg); + } + + if (put_user(idx, *uind)) return false; (*uind)++; @@ -3032,23 +5492,14 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu) { - return ARRAY_SIZE(invariant_sys_regs) - + num_demux_regs() + return num_demux_regs() + walk_sys_regs(vcpu, (u64 __user *)NULL); } int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - unsigned int i; int err; - /* Then give them all the invariant registers' indices. */ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) { - if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) - return -EFAULT; - uindices++; - } - err = walk_sys_regs(vcpu, uindices); if (err < 0) return err; @@ -3057,44 +5508,179 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -int kvm_sys_reg_table_init(void) +#define KVM_ARM_FEATURE_ID_RANGE_INDEX(r) \ + KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(r), \ + sys_reg_Op1(r), \ + sys_reg_CRn(r), \ + sys_reg_CRm(r), \ + sys_reg_Op2(r)) + +int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range) { + const void *zero_page = page_to_virt(ZERO_PAGE(0)); + u64 __user *masks = (u64 __user *)range->addr; + + /* Only feature id range is supported, reserved[13] must be zero. */ + if (range->range || + memcmp(range->reserved, zero_page, sizeof(range->reserved))) + return -EINVAL; + + /* Wipe the whole thing first */ + if (clear_user(masks, KVM_ARM_FEATURE_ID_RANGE_SIZE * sizeof(__u64))) + return -EFAULT; + + for (int i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *reg = &sys_reg_descs[i]; + u32 encoding = reg_to_encoding(reg); + u64 val; + + if (!is_feature_id_reg(encoding) || !reg->set_user) + continue; + + if (!reg->val || + (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) { + continue; + } + val = reg->val; + + if (put_user(val, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding)))) + return -EFAULT; + } + + return 0; +} + +static void vcpu_set_hcr(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + if (has_vhe() || has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_el1_is_32bit(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_RW; + + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; + + /* + * In the absence of FGT, we cannot independently trap TLBI + * Range instructions. This isn't great, but trapping all + * TLBIs would be far worse. Live with it... + */ + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + vcpu->arch.hcr_el2 |= HCR_TTLBOS; +} + +void kvm_calculate_traps(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->arch.config_lock); + vcpu_set_hcr(vcpu); + vcpu_set_ich_hcr(vcpu); + vcpu_set_hcrx(vcpu); + + if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) + goto out; + + compute_fgu(kvm, HFGRTR_GROUP); + compute_fgu(kvm, HFGITR_GROUP); + compute_fgu(kvm, HDFGRTR_GROUP); + compute_fgu(kvm, HAFGRTR_GROUP); + compute_fgu(kvm, HFGRTR2_GROUP); + compute_fgu(kvm, HFGITR2_GROUP); + compute_fgu(kvm, HDFGRTR2_GROUP); + + set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); +out: + mutex_unlock(&kvm->arch.config_lock); +} + +/* + * Perform last adjustments to the ID registers that are implied by the + * configuration outside of the ID regs themselves, as well as any + * initialisation that directly depend on these ID registers (such as + * RES0/RES1 behaviours). This is not the place to configure traps though. + * + * Because this can be called once per CPU, changes must be idempotent. + */ +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + guard(mutex)(&kvm->arch.config_lock); + + /* + * This hacks into the ID registers, so only perform it when the + * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream. + */ + if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) { + u64 val; + + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); + val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val); + } + + if (vcpu_has_nv(vcpu)) { + int ret = kvm_init_nv_sysregs(vcpu); + if (ret) + return ret; + } + + return 0; +} + +int __init kvm_sys_reg_table_init(void) +{ + const struct sys_reg_desc *gicv3_regs; bool valid = true; - unsigned int i; - struct sys_reg_desc clidr; + unsigned int i, sz; + int ret = 0; /* Make sure tables are unique and in order. */ - valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false); - valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true); - valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true); - valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true); - valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true); - valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false); + valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), true); + valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), false); + valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), false); + valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), false); + valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), false); + valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false); + + gicv3_regs = vgic_v3_get_sysreg_table(&sz); + valid &= check_sysreg_table(gicv3_regs, sz, false); if (!valid) return -EINVAL; - /* We abuse the reset function to overwrite the table itself. */ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) - invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); + init_imp_id_regs(); - /* - * CLIDR format is awkward, so clean it up. See ARM B4.1.20: - * - * If software reads the Cache Type fields from Ctype1 - * upwards, once it has seen a value of 0b000, no caches - * exist at further-out levels of the hierarchy. So, for - * example, if Ctype3 is the first Cache Type field with a - * value of 0b000, the values of Ctype4 to Ctype7 must be - * ignored. - */ - get_clidr_el1(NULL, &clidr); /* Ugly... */ - cache_levels = clidr.val; - for (i = 0; i < 7; i++) - if (((cache_levels >> (i*3)) & 7) == 0) - break; - /* Clear all higher bits. */ - cache_levels &= (1 << (i*3))-1; + ret = populate_nv_trap_config(); - return 0; + check_feature_map(); + + for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++) + ret = populate_sysreg_config(sys_reg_descs + i, i); + + for (i = 0; !ret && i < ARRAY_SIZE(sys_insn_descs); i++) + ret = populate_sysreg_config(sys_insn_descs + i, i); + + return ret; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index e4ebb3a379fd..b3f904472fac 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -27,6 +27,13 @@ struct sys_reg_params { bool is_write; }; +#define encoding_to_params(reg) \ + ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \ + .Op1 = sys_reg_Op1(reg), \ + .CRn = sys_reg_CRn(reg), \ + .CRm = sys_reg_CRm(reg), \ + .Op2 = sys_reg_Op2(reg) }) + #define esr_sys64_to_params(esr) \ ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ .Op1 = ((esr) >> 14) & 0x7, \ @@ -64,13 +71,16 @@ struct sys_reg_desc { struct sys_reg_params *, const struct sys_reg_desc *); - /* Initialization for vcpu. */ - void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); + /* + * Initialization for vcpu. Return initialized value, or KVM + * sanitized value for ID registers. + */ + u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); /* Index into sys_reg[], or 0 if we don't need to save it. */ int reg; - /* Value (usually reset value) */ + /* Value (usually reset value), or write mask for idregs */ u64 val; /* Custom get/set_user functions, fallback to generic if NULL */ @@ -98,7 +108,7 @@ inline void print_sys_reg_msg(const struct sys_reg_params *p, /* Look, we even formatted it for you to paste into the table! */ kvm_pr_unimpl("%pV { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n", &(struct va_format){ fmt, &va }, - p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read"); + p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, str_write_read(p->is_write)); va_end(va); } @@ -122,19 +132,21 @@ static inline bool read_zero(struct kvm_vcpu *vcpu, } /* Reset functions */ -static inline void reset_unknown(struct kvm_vcpu *vcpu, +static inline u64 reset_unknown(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); - __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; + __vcpu_assign_sys_reg(vcpu, r->reg, 0x1de7ec7edbadc0deULL); + return __vcpu_sys_reg(vcpu, r->reg); } -static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); - __vcpu_sys_reg(vcpu, r->reg) = r->val; + __vcpu_assign_sys_reg(vcpu, r->reg, r->val); + return __vcpu_sys_reg(vcpu, r->reg); } static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu, @@ -211,6 +223,10 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num); +bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); + +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); + #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x @@ -224,4 +240,27 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) +#define CP15_SYS_DESC(reg) \ + .name = #reg, \ + .aarch32_map = AA32_DIRECT, \ + Op0(0), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + +#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ +({ \ + u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ + (val) &= ~reg##_##field##_MASK; \ + (val) |= FIELD_PREP(reg##_##field##_MASK, \ + min(__f_val, \ + (u64)SYS_FIELD_VALUE(reg, field, limit))); \ + (val); \ +}) + +#define TO_ARM64_SYS_REG(r) ARM64_SYS_REG(sys_reg_Op0(SYS_ ## r), \ + sys_reg_Op1(SYS_ ## r), \ + sys_reg_CRn(SYS_ ## r), \ + sys_reg_CRm(SYS_ ## r), \ + sys_reg_Op2(SYS_ ## r)) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index 33e4e7dd2719..9c60f6465c78 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -2,6 +2,7 @@ #if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ARM_ARM64_KVM_H +#include <asm/kvm_emulate.h> #include <kvm/arm_arch_timer.h> #include <linux/tracepoint.h> @@ -135,6 +136,31 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); +TRACE_EVENT(kvm_mmio_nisv, + TP_PROTO(unsigned long vcpu_pc, unsigned long esr, + unsigned long far, unsigned long ipa), + TP_ARGS(vcpu_pc, esr, far, ipa), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, esr ) + __field( unsigned long, far ) + __field( unsigned long, ipa ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->esr = esr; + __entry->far = far; + __entry->ipa = ipa; + ), + + TP_printk("ipa %#016lx, esr %#016lx, far %#016lx, pc %#016lx", + __entry->ipa, __entry->esr, + __entry->far, __entry->vcpu_pc) +); + + TRACE_EVENT(kvm_set_way_flush, TP_PROTO(unsigned long vcpu_pc, bool cache), TP_ARGS(vcpu_pc, cache), @@ -150,7 +176,7 @@ TRACE_EVENT(kvm_set_way_flush, ), TP_printk("S/W flush at 0x%016lx (cache %s)", - __entry->vcpu_pc, __entry->cache ? "on" : "off") + __entry->vcpu_pc, str_on_off(__entry->cache)) ); TRACE_EVENT(kvm_toggle_cache, @@ -170,8 +196,8 @@ TRACE_EVENT(kvm_toggle_cache, ), TP_printk("VM op at 0x%016lx (cache was %s, now %s)", - __entry->vcpu_pc, __entry->was ? "on" : "off", - __entry->now ? "on" : "off") + __entry->vcpu_pc, str_on_off(__entry->was), + str_on_off(__entry->now)) ); /* @@ -205,6 +231,7 @@ TRACE_EVENT(kvm_get_timer_map, __field( unsigned long, vcpu_id ) __field( int, direct_vtimer ) __field( int, direct_ptimer ) + __field( int, emul_vtimer ) __field( int, emul_ptimer ) ), @@ -213,14 +240,17 @@ TRACE_EVENT(kvm_get_timer_map, __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer); __entry->direct_ptimer = (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1; + __entry->emul_vtimer = + (map->emul_vtimer) ? arch_timer_ctx_index(map->emul_vtimer) : -1; __entry->emul_ptimer = (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1; ), - TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d", + TP_printk("VCPU: %ld, dv: %d, dp: %d, ev: %d, ep: %d", __entry->vcpu_id, __entry->direct_vtimer, __entry->direct_ptimer, + __entry->emul_vtimer, __entry->emul_ptimer) ); @@ -301,6 +331,90 @@ TRACE_EVENT(kvm_timer_emulate, __entry->timer_idx, __entry->should_fire) ); +TRACE_EVENT(kvm_nested_eret, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned long elr_el2, + unsigned long spsr_el2), + TP_ARGS(vcpu, elr_el2, spsr_el2), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(unsigned long, elr_el2) + __field(unsigned long, spsr_el2) + __field(unsigned long, target_mode) + __field(unsigned long, hcr_el2) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->elr_el2 = elr_el2; + __entry->spsr_el2 = spsr_el2; + __entry->target_mode = spsr_el2 & (PSR_MODE_MASK | PSR_MODE32_BIT); + __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2); + ), + + TP_printk("elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx", + __entry->elr_el2, __entry->spsr_el2, + __print_symbolic(__entry->target_mode, kvm_mode_names), + __entry->hcr_el2) +); + +TRACE_EVENT(kvm_inject_nested_exception, + TP_PROTO(struct kvm_vcpu *vcpu, u64 esr_el2, int type), + TP_ARGS(vcpu, esr_el2, type), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(unsigned long, esr_el2) + __field(int, type) + __field(unsigned long, spsr_el2) + __field(unsigned long, pc) + __field(unsigned long, source_mode) + __field(unsigned long, hcr_el2) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->esr_el2 = esr_el2; + __entry->type = type; + __entry->spsr_el2 = *vcpu_cpsr(vcpu); + __entry->pc = *vcpu_pc(vcpu); + __entry->source_mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2); + ), + + TP_printk("%s: esr_el2 0x%lx elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx", + __print_symbolic(__entry->type, kvm_exception_type_names), + __entry->esr_el2, __entry->pc, __entry->spsr_el2, + __print_symbolic(__entry->source_mode, kvm_mode_names), + __entry->hcr_el2) +); + +TRACE_EVENT(kvm_forward_sysreg_trap, + TP_PROTO(struct kvm_vcpu *vcpu, u32 sysreg, bool is_read), + TP_ARGS(vcpu, sysreg, is_read), + + TP_STRUCT__entry( + __field(u64, pc) + __field(u32, sysreg) + __field(bool, is_read) + ), + + TP_fast_assign( + __entry->pc = *vcpu_pc(vcpu); + __entry->sysreg = sysreg; + __entry->is_read = is_read; + ), + + TP_printk("%llx %c (%d,%d,%d,%d,%d)", + __entry->pc, + __entry->is_read ? 'R' : 'W', + sys_reg_Op0(__entry->sysreg), + sys_reg_Op1(__entry->sysreg), + sys_reg_CRn(__entry->sysreg), + sys_reg_CRm(__entry->sysreg), + sys_reg_Op2(__entry->sysreg)) +); + #endif /* _TRACE_ARM_ARM64_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 064a58c19f48..a7ab9a3bbed0 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -46,38 +46,6 @@ TRACE_EVENT(kvm_hvc_arm64, __entry->vcpu_pc, __entry->r0, __entry->imm) ); -TRACE_EVENT(kvm_arm_setup_debug, - TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), - TP_ARGS(vcpu, guest_debug), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->guest_debug = guest_debug; - ), - - TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) -); - -TRACE_EVENT(kvm_arm_clear_debug, - TP_PROTO(__u32 guest_debug), - TP_ARGS(guest_debug), - - TP_STRUCT__entry( - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->guest_debug = guest_debug; - ), - - TP_printk("flags: 0x%08x", __entry->guest_debug) -); - /* * The dreg32 name is a leftover from a distant past. This will really * output a 64bit value... @@ -99,49 +67,6 @@ TRACE_EVENT(kvm_arm_set_dreg32, TP_printk("%s: 0x%llx", __entry->name, __entry->value) ); -TRACE_DEFINE_SIZEOF(__u64); - -TRACE_EVENT(kvm_arm_set_regset, - TP_PROTO(const char *type, int len, __u64 *control, __u64 *value), - TP_ARGS(type, len, control, value), - TP_STRUCT__entry( - __field(const char *, name) - __field(int, len) - __array(u64, ctrls, 16) - __array(u64, values, 16) - ), - TP_fast_assign( - __entry->name = type; - __entry->len = len; - memcpy(__entry->ctrls, control, len << 3); - memcpy(__entry->values, value, len << 3); - ), - TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name, - __print_array(__entry->ctrls, __entry->len, sizeof(__u64)), - __print_array(__entry->values, __entry->len, sizeof(__u64))) -); - -TRACE_EVENT(trap_reg, - TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value), - TP_ARGS(fn, reg, is_write, write_value), - - TP_STRUCT__entry( - __field(const char *, fn) - __field(int, reg) - __field(bool, is_write) - __field(u64, write_value) - ), - - TP_fast_assign( - __entry->fn = fn; - __entry->reg = reg; - __entry->is_write = is_write; - __entry->write_value = write_value; - ), - - TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) -); - TRACE_EVENT(kvm_handle_sys_reg, TP_PROTO(unsigned long hsr), TP_ARGS(hsr), @@ -188,7 +113,7 @@ TRACE_EVENT(kvm_sys_access, __entry->vcpu_pc, __entry->name ?: "UNKN", __entry->Op0, __entry->Op1, __entry->CRn, __entry->CRm, __entry->Op2, - __entry->is_write ? "write" : "read") + str_write_read(__entry->is_write)) ); TRACE_EVENT(kvm_set_guest_debug, diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 9e7c486b48c2..bdc2d57370b2 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -35,12 +35,12 @@ static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, vgic_v3_cpu->num_id_bits = host_id_bits; - host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2); + host_seis = FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2); seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val); if (host_seis != seis) return -EINVAL; - host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2); + host_a3v = FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2); a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val); if (host_a3v != a3v) return -EINVAL; @@ -68,10 +68,10 @@ static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1); val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits); val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK, - FIELD_GET(ICH_VTR_SEIS_MASK, + FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2)); val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK, - FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2)); + FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2)); /* * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. * Extract it directly using ICC_CTLR_EL1 reg definitions. @@ -297,6 +297,91 @@ static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, return 0; } +static int set_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + __vcpu_assign_sys_reg(vcpu, r->reg, val); + return 0; +} + +static int get_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = __vcpu_sys_reg(vcpu, r->reg); + return 0; +} + +static int set_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + return set_gic_ich_reg(vcpu, r, val); +} + +static int get_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + return get_gic_ich_reg(vcpu, r, val); +} + +static int set_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + if (val != KVM_ICC_SRE_EL2) + return -EINVAL; + return 0; +} + +static int get_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = KVM_ICC_SRE_EL2; + return 0; +} + +static int set_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + if (val != kvm_get_guest_vtr_el2()) + return -EINVAL; + return 0; +} + +static int get_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = kvm_get_guest_vtr_el2(); + return 0; +} + +static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return vcpu_has_nv(vcpu) ? 0 : REG_HIDDEN; +} + +#define __EL2_REG(r, acc, i) \ + { \ + SYS_DESC(SYS_ ## r), \ + .get_user = get_gic_ ## acc, \ + .set_user = set_gic_ ## acc, \ + .reg = i, \ + .visibility = el2_visibility, \ + } + +#define EL2_REG(r, acc) __EL2_REG(r, acc, r) + +#define EL2_REG_RO(r, acc) __EL2_REG(r, acc, 0) + static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { { SYS_DESC(SYS_ICC_PMR_EL1), .set_user = set_gic_pmr, .get_user = get_gic_pmr, }, @@ -328,8 +413,42 @@ static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { .set_user = set_gic_grpen0, .get_user = get_gic_grpen0, }, { SYS_DESC(SYS_ICC_IGRPEN1_EL1), .set_user = set_gic_grpen1, .get_user = get_gic_grpen1, }, + EL2_REG(ICH_AP0R0_EL2, ich_apr), + EL2_REG(ICH_AP0R1_EL2, ich_apr), + EL2_REG(ICH_AP0R2_EL2, ich_apr), + EL2_REG(ICH_AP0R3_EL2, ich_apr), + EL2_REG(ICH_AP1R0_EL2, ich_apr), + EL2_REG(ICH_AP1R1_EL2, ich_apr), + EL2_REG(ICH_AP1R2_EL2, ich_apr), + EL2_REG(ICH_AP1R3_EL2, ich_apr), + EL2_REG_RO(ICC_SRE_EL2, icc_sre), + EL2_REG(ICH_HCR_EL2, ich_reg), + EL2_REG_RO(ICH_VTR_EL2, ich_vtr), + EL2_REG(ICH_VMCR_EL2, ich_reg), + EL2_REG(ICH_LR0_EL2, ich_reg), + EL2_REG(ICH_LR1_EL2, ich_reg), + EL2_REG(ICH_LR2_EL2, ich_reg), + EL2_REG(ICH_LR3_EL2, ich_reg), + EL2_REG(ICH_LR4_EL2, ich_reg), + EL2_REG(ICH_LR5_EL2, ich_reg), + EL2_REG(ICH_LR6_EL2, ich_reg), + EL2_REG(ICH_LR7_EL2, ich_reg), + EL2_REG(ICH_LR8_EL2, ich_reg), + EL2_REG(ICH_LR9_EL2, ich_reg), + EL2_REG(ICH_LR10_EL2, ich_reg), + EL2_REG(ICH_LR11_EL2, ich_reg), + EL2_REG(ICH_LR12_EL2, ich_reg), + EL2_REG(ICH_LR13_EL2, ich_reg), + EL2_REG(ICH_LR14_EL2, ich_reg), + EL2_REG(ICH_LR15_EL2, ich_reg), }; +const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz) +{ + *sz = ARRAY_SIZE(gic_v3_icc_reg_descs); + return gic_v3_icc_reg_descs; +} + static u64 attr_to_id(u64 attr) { return ARM64_SYS_REG(FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr), @@ -341,8 +460,12 @@ static u64 attr_to_id(u64 attr) int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - if (get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs, - ARRAY_SIZE(gic_v3_icc_reg_descs))) + const struct sys_reg_desc *r; + + r = get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); + + if (r && !sysreg_hidden(vcpu, r)) return 0; return -ENXIO; diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 78cde687383c..bb92853d1fd3 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -28,27 +28,74 @@ struct vgic_state_iter { int nr_lpis; int dist_id; int vcpu_id; - int intid; + unsigned long intid; int lpi_idx; - u32 *lpi_array; }; -static void iter_next(struct vgic_state_iter *iter) +static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) { + struct vgic_dist *dist = &kvm->arch.vgic; + if (iter->dist_id == 0) { iter->dist_id++; return; } + /* + * Let the xarray drive the iterator after the last SPI, as the iterator + * has exhausted the sequentially-allocated INTID space. + */ + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) && + iter->nr_lpis) { + if (iter->lpi_idx < iter->nr_lpis) + xa_find_after(&dist->lpi_xa, &iter->intid, + VGIC_LPI_MAX_INTID, + LPI_XA_MARK_DEBUG_ITER); + iter->lpi_idx++; + return; + } + iter->intid++; if (iter->intid == VGIC_NR_PRIVATE_IRQS && ++iter->vcpu_id < iter->nr_cpus) iter->intid = 0; +} - if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) { - if (iter->lpi_idx < iter->nr_lpis) - iter->intid = iter->lpi_array[iter->lpi_idx]; - iter->lpi_idx++; +static int iter_mark_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid, flags; + struct vgic_irq *irq; + int nr_lpis = 0; + + xa_lock_irqsave(&dist->lpi_xa, flags); + + xa_for_each(&dist->lpi_xa, intid, irq) { + if (!vgic_try_get_irq_ref(irq)) + continue; + + __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + nr_lpis++; + } + + xa_unlock_irqrestore(&dist->lpi_xa, flags); + + return nr_lpis; +} + +static void iter_unmark_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid, flags; + struct vgic_irq *irq; + + xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { + xa_lock_irqsave(&dist->lpi_xa, flags); + __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + xa_unlock_irqrestore(&dist->lpi_xa, flags); + + /* vgic_put_irq() expects to be called outside of the xa_lock */ + vgic_put_irq(kvm, irq); } } @@ -61,15 +108,12 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, iter->nr_cpus = nr_cpus; iter->nr_spis = kvm->arch.vgic.nr_spis; - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array); - if (iter->nr_lpis < 0) - iter->nr_lpis = 0; - } + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + iter->nr_lpis = iter_mark_lpis(kvm); /* Fast forward to the right position if needed */ while (pos--) - iter_next(iter); + iter_next(kvm, iter); } static bool end_of_vgic(struct vgic_state_iter *iter) @@ -77,7 +121,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter) return iter->dist_id > 0 && iter->vcpu_id == iter->nr_cpus && iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && - iter->lpi_idx > iter->nr_lpis; + (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis); } static void *vgic_debug_start(struct seq_file *s, loff_t *pos) @@ -85,7 +129,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos) struct kvm *kvm = s->private; struct vgic_state_iter *iter; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; if (iter) { iter = ERR_PTR(-EBUSY); @@ -104,7 +148,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos) if (end_of_vgic(iter)) iter = NULL; out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); return iter; } @@ -114,7 +158,7 @@ static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) struct vgic_state_iter *iter = kvm->arch.vgic.iter; ++*pos; - iter_next(iter); + iter_next(kvm, iter); if (end_of_vgic(iter)) iter = NULL; return iter; @@ -132,15 +176,16 @@ static void vgic_debug_stop(struct seq_file *s, void *v) if (IS_ERR(v)) return; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; - kfree(iter->lpi_array); + iter_unmark_lpis(kvm); kfree(iter); kvm->arch.vgic.iter = NULL; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); } -static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) +static void print_dist_state(struct seq_file *s, struct vgic_dist *dist, + struct vgic_state_iter *iter) { bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; @@ -149,7 +194,7 @@ static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); if (v3) - seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count); + seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis); seq_printf(s, "enabled:\t%d\n", dist->enabled); seq_printf(s, "\n"); @@ -166,7 +211,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq, if (vcpu) { hdr = "VCPU"; - id = vcpu->vcpu_id; + id = vcpu->vcpu_idx; } seq_printf(s, "\n"); @@ -212,7 +257,7 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, " %2d " "\n", type, irq->intid, - (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1, + (irq->target_vcpu) ? irq->target_vcpu->vcpu_idx : -1, pending, irq->line_level, irq->active, @@ -224,7 +269,7 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, irq->mpidr, irq->source, irq->priority, - (irq->vcpu) ? irq->vcpu->vcpu_id : -1); + (irq->vcpu) ? irq->vcpu->vcpu_idx : -1); } static int vgic_debug_show(struct seq_file *s, void *v) @@ -236,7 +281,7 @@ static int vgic_debug_show(struct seq_file *s, void *v) unsigned long flags; if (iter->dist_id == 0) { - print_dist_state(s, &kvm->arch.vgic); + print_dist_state(s, &kvm->arch.vgic, iter); return 0; } @@ -246,11 +291,16 @@ static int vgic_debug_show(struct seq_file *s, void *v) if (iter->vcpu_id < iter->nr_cpus) vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); - irq = vgic_get_irq(kvm, vcpu, iter->intid); - if (!irq) { - seq_printf(s, " LPI %4d freed\n", iter->intid); - return 0; - } + /* + * Expect this to succeed, as iter_mark_lpis() takes a reference on + * every LPI to be visited. + */ + if (iter->intid < VGIC_NR_PRIVATE_IRQS) + irq = vgic_get_vcpu_irq(vcpu, iter->intid); + else + irq = vgic_get_irq(kvm, iter->intid); + if (WARN_ON_ONCE(!irq)) + return -EINVAL; raw_spin_lock_irqsave(&irq->irq_lock, flags); print_irq_state(s, irq, vcpu); @@ -278,3 +328,230 @@ void vgic_debug_init(struct kvm *kvm) void vgic_debug_destroy(struct kvm *kvm) { } + +/** + * struct vgic_its_iter - Iterator for traversing VGIC ITS device tables. + * @dev: Pointer to the current its_device being processed. + * @ite: Pointer to the current its_ite within the device being processed. + * + * This structure is used to maintain the current position during iteration + * over the ITS device tables. It holds pointers to both the current device + * and the current ITE within that device. + */ +struct vgic_its_iter { + struct its_device *dev; + struct its_ite *ite; +}; + +/** + * end_of_iter - Checks if the iterator has reached the end. + * @iter: The iterator to check. + * + * When the iterator completed processing the final ITE in the last device + * table, it was marked to indicate the end of iteration by setting its + * device and ITE pointers to NULL. + * This function checks whether the iterator was marked as end. + * + * Return: True if the iterator is marked as end, false otherwise. + */ +static inline bool end_of_iter(struct vgic_its_iter *iter) +{ + return !iter->dev && !iter->ite; +} + +/** + * vgic_its_iter_next - Advances the iterator to the next entry in the ITS tables. + * @its: The VGIC ITS structure. + * @iter: The iterator to advance. + * + * This function moves the iterator to the next ITE within the current device, + * or to the first ITE of the next device if the current ITE is the last in + * the device. If the current device is the last device, the iterator is set + * to indicate the end of iteration. + */ +static void vgic_its_iter_next(struct vgic_its *its, struct vgic_its_iter *iter) +{ + struct its_device *dev = iter->dev; + struct its_ite *ite = iter->ite; + + if (!ite || list_is_last(&ite->ite_list, &dev->itt_head)) { + if (list_is_last(&dev->dev_list, &its->device_list)) { + dev = NULL; + ite = NULL; + } else { + dev = list_next_entry(dev, dev_list); + ite = list_first_entry_or_null(&dev->itt_head, + struct its_ite, + ite_list); + } + } else { + ite = list_next_entry(ite, ite_list); + } + + iter->dev = dev; + iter->ite = ite; +} + +/** + * vgic_its_debug_start - Start function for the seq_file interface. + * @s: The seq_file structure. + * @pos: The starting position (offset). + * + * This function initializes the iterator to the beginning of the ITS tables + * and advances it to the specified position. It acquires the its_lock mutex + * to protect shared data. + * + * Return: An iterator pointer on success, NULL if no devices are found or + * the end of the list is reached, or ERR_PTR(-ENOMEM) on memory + * allocation failure. + */ +static void *vgic_its_debug_start(struct seq_file *s, loff_t *pos) +{ + struct vgic_its *its = s->private; + struct vgic_its_iter *iter; + struct its_device *dev; + loff_t offset = *pos; + + mutex_lock(&its->its_lock); + + dev = list_first_entry_or_null(&its->device_list, + struct its_device, dev_list); + if (!dev) + return NULL; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return ERR_PTR(-ENOMEM); + + iter->dev = dev; + iter->ite = list_first_entry_or_null(&dev->itt_head, + struct its_ite, ite_list); + + while (!end_of_iter(iter) && offset--) + vgic_its_iter_next(its, iter); + + if (end_of_iter(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +/** + * vgic_its_debug_next - Next function for the seq_file interface. + * @s: The seq_file structure. + * @v: The current iterator. + * @pos: The current position (offset). + * + * This function advances the iterator to the next entry and increments the + * position. + * + * Return: An iterator pointer on success, or NULL if the end of the list is + * reached. + */ +static void *vgic_its_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct vgic_its *its = s->private; + struct vgic_its_iter *iter = v; + + ++*pos; + vgic_its_iter_next(its, iter); + + if (end_of_iter(iter)) { + kfree(iter); + return NULL; + } + return iter; +} + +/** + * vgic_its_debug_stop - Stop function for the seq_file interface. + * @s: The seq_file structure. + * @v: The current iterator. + * + * This function frees the iterator and releases the its_lock mutex. + */ +static void vgic_its_debug_stop(struct seq_file *s, void *v) +{ + struct vgic_its *its = s->private; + struct vgic_its_iter *iter = v; + + if (!IS_ERR_OR_NULL(iter)) + kfree(iter); + mutex_unlock(&its->its_lock); +} + +/** + * vgic_its_debug_show - Show function for the seq_file interface. + * @s: The seq_file structure. + * @v: The current iterator. + * + * This function formats and prints the ITS table entry information to the + * seq_file output. + * + * Return: 0 on success. + */ +static int vgic_its_debug_show(struct seq_file *s, void *v) +{ + struct vgic_its_iter *iter = v; + struct its_device *dev = iter->dev; + struct its_ite *ite = iter->ite; + + if (!ite) + return 0; + + if (list_is_first(&ite->ite_list, &dev->itt_head)) { + seq_printf(s, "\n"); + seq_printf(s, "Device ID: 0x%x, Event ID Range: [0 - %llu]\n", + dev->device_id, BIT_ULL(dev->num_eventid_bits) - 1); + seq_printf(s, "EVENT_ID INTID HWINTID TARGET COL_ID HW\n"); + seq_printf(s, "-----------------------------------------------\n"); + } + + if (ite->irq && ite->collection) { + seq_printf(s, "%8u %8u %8u %8u %8u %2d\n", + ite->event_id, ite->irq->intid, ite->irq->hwintid, + ite->collection->target_addr, + ite->collection->collection_id, ite->irq->hw); + } + + return 0; +} + +static const struct seq_operations vgic_its_debug_sops = { + .start = vgic_its_debug_start, + .next = vgic_its_debug_next, + .stop = vgic_its_debug_stop, + .show = vgic_its_debug_show +}; + +DEFINE_SEQ_ATTRIBUTE(vgic_its_debug); + +/** + * vgic_its_debug_init - Initializes the debugfs interface for VGIC ITS. + * @dev: The KVM device structure. + * + * This function creates a debugfs file named "vgic-its-state@%its_base" + * to expose the ITS table information. + * + * Return: 0 on success. + */ +int vgic_its_debug_init(struct kvm_device *dev) +{ + struct vgic_its *its = dev->private; + char *name; + + name = kasprintf(GFP_KERNEL, "vgic-its-state@%llx", (u64)its->vgic_its_base); + if (!name) + return -ENOMEM; + + debugfs_create_file(name, 0444, dev->kvm->debugfs_dentry, its, &vgic_its_debug_fops); + + kfree(name); + return 0; +} + +void vgic_its_debug_destroy(struct kvm_device *dev) +{ +} diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index f6d4f4052555..dc9f9db31026 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -34,9 +34,9 @@ * * CPU Interface: * - * - kvm_vgic_vcpu_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. + * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend + * on any sizing information. Private interrupts are allocated if not + * already allocated at vgic-creation time. */ /* EARLY INIT */ @@ -53,13 +53,13 @@ void kvm_vgic_early_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - INIT_LIST_HEAD(&dist->lpi_list_head); - INIT_LIST_HEAD(&dist->lpi_translation_cache); - raw_spin_lock_init(&dist->lpi_list_lock); + xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); } /* CREATION */ +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); + /** * kvm_vgic_create: triggered by the instantiation of the VGIC device by * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) @@ -71,12 +71,10 @@ void kvm_vgic_early_init(struct kvm *kvm) int kvm_vgic_create(struct kvm *kvm, u32 type) { struct kvm_vcpu *vcpu; + u64 aa64pfr0, pfr1; unsigned long i; int ret; - if (irqchip_in_kernel(kvm)) - return -EEXIST; - /* * This function is also called by the KVM_CREATE_IRQCHIP handler, * which had no chance yet to check the availability of the GICv2 @@ -87,10 +85,45 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) !kvm_vgic_global_state.can_emulate_gicv2) return -ENODEV; + /* + * Ensure mutual exclusion with vCPU creation and any vCPU ioctls by: + * + * - Holding kvm->lock to prevent KVM_CREATE_VCPU from reaching + * kvm_arch_vcpu_precreate() and ensuring created_vcpus is stable. + * This alone is insufficient, as kvm_vm_ioctl_create_vcpu() drops + * the kvm->lock before completing the vCPU creation. + */ + lockdep_assert_held(&kvm->lock); + + /* + * - Acquiring the vCPU mutex for every *online* vCPU to prevent + * concurrent vCPU ioctls for vCPUs already visible to userspace. + */ ret = -EBUSY; - if (!lock_all_vcpus(kvm)) + if (kvm_trylock_all_vcpus(kvm)) return ret; + /* + * - Taking the config_lock which protects VGIC data structures such + * as the per-vCPU arrays of private IRQs (SGIs, PPIs). + */ + mutex_lock(&kvm->arch.config_lock); + + /* + * - Bailing on the entire thing if a vCPU is in the middle of creation, + * dropped the kvm->lock, but hasn't reached kvm_arch_vcpu_create(). + * + * The whole combination of this guarantees that no vCPU can get into + * KVM with a VGIC configuration inconsistent with the VM's VGIC. + */ + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) + goto out_unlock; + + if (irqchip_in_kernel(kvm)) { + ret = -EEXIST; + goto out_unlock; + } + kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu_has_run_once(vcpu)) goto out_unlock; @@ -107,18 +140,48 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) goto out_unlock; } + kvm_for_each_vcpu(i, vcpu, kvm) { + ret = vgic_allocate_private_irqs_locked(vcpu, type); + if (ret) + break; + } + + if (ret) { + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + } + + goto out_unlock; + } + kvm->arch.vgic.in_kernel = true; kvm->arch.vgic.vgic_model = type; + kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST; kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) { kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - else + } else { INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); + aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); + } + + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); + + if (type == KVM_DEV_TYPE_ARM_VGIC_V3) + kvm->arch.vgic.nassgicap = system_supports_direct_sgis(); out_unlock: - unlock_all_vcpus(kvm); + mutex_unlock(&kvm->arch.config_lock); + kvm_unlock_all_vcpus(kvm); return ret; } @@ -135,6 +198,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; + dist->active_spis = (atomic_t)ATOMIC_INIT(0); dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!dist->spis) return -ENOMEM; @@ -155,7 +219,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) raw_spin_lock_init(&irq->irq_lock); irq->vcpu = NULL; irq->target_vcpu = vcpu0; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); switch (dist->vgic_model) { case KVM_DEV_TYPE_ARM_VGIC_V2: irq->targets = 0; @@ -174,27 +238,43 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) return 0; } -/** - * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data - * structures and register VCPU-specific KVM iodevs - * - * @vcpu: pointer to the VCPU being created and initialized - * - * Only do initialization, but do not actually enable the - * VGIC CPU interface - */ -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +/* Default GICv3 Maintenance Interrupt INTID, as per SBSA */ +#define DEFAULT_MI_INTID 25 + +int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu) +{ + int ret; + + guard(mutex)(&vcpu->kvm->arch.config_lock); + + /* + * Matching the tradition established with the timers, provide + * a default PPI for the maintenance interrupt. It makes + * things easier to reason about. + */ + if (vcpu->kvm->arch.vgic.mi_intid == 0) + vcpu->kvm->arch.vgic.mi_intid = DEFAULT_MI_INTID; + ret = kvm_vgic_set_owner(vcpu, vcpu->kvm->arch.vgic.mi_intid, vcpu); + + return ret; +} + +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int ret = 0; int i; - vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + lockdep_assert_held(&vcpu->kvm->arch.config_lock); - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - raw_spin_lock_init(&vgic_cpu->ap_list_lock); - atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + if (vgic_cpu->private_irqs) + return 0; + + vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS, + sizeof(struct vgic_irq), + GFP_KERNEL_ACCOUNT); + + if (!vgic_cpu->private_irqs) + return -ENOMEM; /* * Enable and configure all SGIs to be edge-triggered and @@ -208,7 +288,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) irq->intid = i; irq->vcpu = NULL; irq->target_vcpu = vcpu; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); if (vgic_irq_is_sgi(i)) { /* SGIs */ irq->enabled = 1; @@ -217,29 +297,79 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) /* PPIs */ irq->config = VGIC_CONFIG_LEVEL; } + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->group = 1; + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->group = 0; + irq->targets = BIT(vcpu->vcpu_id); + break; + } } + return 0; +} + +static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type) +{ + int ret; + + mutex_lock(&vcpu->kvm->arch.config_lock); + ret = vgic_allocate_private_irqs_locked(vcpu, type); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return ret; +} + +/** + * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data + * structures and register VCPU-specific KVM iodevs + * + * @vcpu: pointer to the VCPU being created and initialized + * + * Only do initialization, but do not actually enable the + * VGIC CPU interface + */ +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int ret = 0; + + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + raw_spin_lock_init(&vgic_cpu->ap_list_lock); + atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + if (!irqchip_in_kernel(vcpu->kvm)) return 0; + ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model); + if (ret) + return ret; + /* * If we are creating a VCPU with a GICv3 we must also register the * KVM io device for the redistributor that belongs to this VCPU. */ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->slots_lock); ret = vgic_register_redist_iodev(vcpu); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->slots_lock); } return ret; } -static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) +static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu) { if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_enable(vcpu); + vgic_v2_reset(vcpu); else - vgic_v3_enable(vcpu); + vgic_v3_reset(vcpu); } /* @@ -250,15 +380,16 @@ static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) * The function is generally called when nr_spis has been explicitly set * by the guest through the KVM DEVICE API. If not nr_spis is set to 256. * vgic_initialized() returns true when this function has succeeded. - * Must be called with kvm->lock held! */ int vgic_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int ret = 0, i; + int ret = 0; unsigned long idx; + lockdep_assert_held(&kvm->arch.config_lock); + if (vgic_initialized(kvm)) return 0; @@ -274,59 +405,25 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; - /* Initialize groups on CPUs created before the VGIC type was known */ - kvm_for_each_vcpu(idx, vcpu, kvm) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - switch (dist->vgic_model) { - case KVM_DEV_TYPE_ARM_VGIC_V3: - irq->group = 1; - irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - break; - case KVM_DEV_TYPE_ARM_VGIC_V2: - irq->group = 0; - irq->targets = 1U << idx; - break; - default: - ret = -EINVAL; - goto out; - } - } - } - - if (vgic_has_its(kvm)) - vgic_lpi_translation_cache_init(kvm); - /* - * If we have GICv4.1 enabled, unconditionnaly request enable the - * v4 support so that we get HW-accelerated vSGIs. Otherwise, only - * enable it if we present a virtual ITS to the guest. + * Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs, + * vLPIs) is supported. */ - if (vgic_supports_direct_msis(kvm)) { + if (vgic_supports_direct_irqs(kvm)) { ret = vgic_v4_init(kvm); if (ret) goto out; } kvm_for_each_vcpu(idx, vcpu, kvm) - kvm_vgic_vcpu_enable(vcpu); + kvm_vgic_vcpu_reset(vcpu); ret = kvm_vgic_setup_default_irq_routing(kvm); if (ret) goto out; vgic_debug_init(kvm); - - /* - * If userspace didn't set the GIC implementation revision, - * default to the latest and greatest. You know want it. - */ - if (!dist->implementation_rev) - dist->implementation_rev = KVM_VGIC_IMP_REV_LATEST; dist->initialized = true; - out: return ret; } @@ -346,20 +443,19 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); INIT_LIST_HEAD(&dist->rd_regions); } else { dist->vgic_cpu_base = VGIC_ADDR_UNDEF; } - if (vgic_has_its(kvm)) - vgic_lpi_translation_cache_destroy(kvm); - - if (vgic_supports_direct_msis(kvm)) + if (vgic_supports_direct_irqs(kvm)) vgic_v4_teardown(kvm); + + xa_destroy(&dist->lpi_xa); } -void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) +static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; @@ -370,33 +466,69 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) vgic_flush_pending_lpis(vcpu); INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + /* + * If this vCPU is being destroyed because of a failed creation + * then unregister the redistributor to avoid leaving behind a + * dangling pointer to the vCPU struct. + * + * vCPUs that have been successfully created (i.e. added to + * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as + * this function gets called while holding kvm->arch.config_lock + * in the VM teardown path and would otherwise introduce a lock + * inversion w.r.t. kvm->srcu. + * + * vCPUs that failed creation are torn down outside of the + * kvm->arch.config_lock and do not get unregistered in + * kvm_vgic_destroy(), meaning it is both safe and necessary to + * do so here. + */ + if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu) + vgic_unregister_redist_iodev(vcpu); + + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + } } -/* To be called with kvm->lock held */ -static void __kvm_vgic_destroy(struct kvm *kvm) +void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->slots_lock); + __kvm_vgic_vcpu_destroy(vcpu); + mutex_unlock(&kvm->slots_lock); +} + +void kvm_vgic_destroy(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; + mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->arch.config_lock); + vgic_debug_destroy(kvm); kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vgic_vcpu_destroy(vcpu); + __kvm_vgic_vcpu_destroy(vcpu); kvm_vgic_dist_destroy(kvm); -} -void kvm_vgic_destroy(struct kvm *kvm) -{ - mutex_lock(&kvm->lock); - __kvm_vgic_destroy(kvm); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); + + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + kvm_for_each_vcpu(i, vcpu, kvm) + vgic_unregister_redist_iodev(vcpu); + + mutex_unlock(&kvm->slots_lock); } /** * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest - * is a GICv2. A GICv3 must be explicitly initialized by the guest using the + * is a GICv2. A GICv3 must be explicitly initialized by userspace using the * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group. * @kvm: kvm struct pointer */ @@ -414,9 +546,9 @@ int vgic_lazy_init(struct kvm *kvm) if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) return -EBUSY; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); ret = vgic_init(kvm); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); } return ret; @@ -425,67 +557,95 @@ int vgic_lazy_init(struct kvm *kvm) /* RESOURCE MAPPING */ /** + * kvm_vgic_map_resources - map the MMIO regions + * @kvm: kvm struct pointer + * * Map the MMIO regions depending on the VGIC model exposed to the guest * called on the first VCPU run. * Also map the virtual CPU interface into the VM. * v2 calls vgic_init() if not already done. * v3 and derivatives return an error if the VGIC is not initialized. - * vgic_ready() returns true if this function has succeeded. - * @kvm: kvm struct pointer */ int kvm_vgic_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + enum vgic_type type; + gpa_t dist_base; int ret = 0; - if (likely(vgic_ready(kvm))) + if (likely(smp_load_acquire(&dist->ready))) return 0; - mutex_lock(&kvm->lock); - if (vgic_ready(kvm)) + mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->arch.config_lock); + if (dist->ready) goto out; if (!irqchip_in_kernel(kvm)) goto out; - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { ret = vgic_v2_map_resources(kvm); - else + type = VGIC_V2; + } else { ret = vgic_v3_map_resources(kvm); + type = VGIC_V3; + } if (ret) - __kvm_vgic_destroy(kvm); - else - dist->ready = true; + goto out; + dist_base = dist->vgic_dist_base; + mutex_unlock(&kvm->arch.config_lock); + + ret = vgic_register_dist_iodev(kvm, dist_base, type); + if (ret) { + kvm_err("Unable to register VGIC dist MMIO regions\n"); + goto out_slots; + } + + smp_store_release(&dist->ready, true); + goto out_slots; out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); +out_slots: + if (ret) + kvm_vm_dead(kvm); + + mutex_unlock(&kvm->slots_lock); + return ret; } /* GENERIC PROBE */ -static int vgic_init_cpu_starting(unsigned int cpu) +void kvm_vgic_cpu_up(void) { enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); - return 0; } -static int vgic_init_cpu_dying(unsigned int cpu) +void kvm_vgic_cpu_down(void) { disable_percpu_irq(kvm_vgic_global_state.maint_irq); - return 0; } static irqreturn_t vgic_maintenance_handler(int irq, void *data) { + struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)data; + /* * We cannot rely on the vgic maintenance interrupt to be * delivered synchronously. This means we can only use it to * exit the VM, and we perform the handling of EOIed * interrupts on the exit path (see vgic_fold_lr_state). + * + * Of course, NV throws a wrench in this plan, and needs + * something special. */ + if (vcpu && vgic_state_is_nested(vcpu)) + vgic_v3_handle_nested_maint_irq(vcpu); + return IRQ_HANDLED; } @@ -512,10 +672,12 @@ void kvm_vgic_init_cpu_hardware(void) * We want to make sure the list registers start out clear so that we * only have the program the used registers. */ - if (kvm_vgic_global_state.type == VGIC_V2) + if (kvm_vgic_global_state.type == VGIC_V2) { vgic_v2_init_lrs(); - else + } else if (kvm_vgic_global_state.type == VGIC_V3 || + kvm_vgic_global_state.has_gcie_v3_compat) { kvm_call_hyp(__vgic_v3_init_lrs); + } } /** @@ -560,6 +722,9 @@ int kvm_vgic_hyp_init(void) kvm_info("GIC system register CPU interface enabled\n"); } break; + case GIC_V5: + ret = vgic_v5_probe(gic_kvm_info); + break; default: ret = -ENODEV; } @@ -572,7 +737,7 @@ int kvm_vgic_hyp_init(void) if (ret) return ret; - if (!has_mask) + if (!has_mask && !kvm_vgic_global_state.maint_irq) return 0; ret = request_percpu_irq(kvm_vgic_global_state.maint_irq, @@ -584,19 +749,6 @@ int kvm_vgic_hyp_init(void) return ret; } - ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, - "kvm/arm/vgic:starting", - vgic_init_cpu_starting, vgic_init_cpu_dying); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); return 0; - -out_free_irq: - free_percpu_irq(kvm_vgic_global_state.maint_irq, - kvm_get_running_vcpus()); - return ret; } diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 475059bacedf..c314c016659a 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -9,7 +9,7 @@ #include <kvm/arm_vgic.h> #include "vgic.h" -/** +/* * vgic_irqfd_set_irq: inject the IRQ corresponding to the * irqchip routing entry * @@ -23,7 +23,7 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, if (!vgic_valid_spi(kvm, spi_id)) return -EINVAL; - return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL); + return kvm_vgic_inject_irq(kvm, NULL, spi_id, level, NULL); } /** @@ -75,7 +75,8 @@ static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, msi->flags = e->msi.flags; msi->devid = e->msi.devid; } -/** + +/* * kvm_set_msi: inject the MSI corresponding to the * MSI routing entry * @@ -98,7 +99,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return vgic_its_inject_msi(kvm, &msi); } -/** +/* * kvm_arch_set_irq_inatomic: fast-path for irqfd injection */ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 94a666dd1443..3f1c4b10fed9 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -23,12 +23,49 @@ #include "vgic.h" #include "vgic-mmio.h" +static struct kvm_device_ops kvm_arm_vgic_its_ops; + static int vgic_its_save_tables_v0(struct vgic_its *its); static int vgic_its_restore_tables_v0(struct vgic_its *its); static int vgic_its_commit_v0(struct vgic_its *its); static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, struct kvm_vcpu *filter_vcpu, bool needs_inv); +#define vgic_its_read_entry_lock(i, g, valp, t) \ + ({ \ + int __sz = vgic_its_get_abi(i)->t##_esz; \ + struct kvm *__k = (i)->dev->kvm; \ + int __ret; \ + \ + BUILD_BUG_ON(NR_ITS_ABIS == 1 && \ + sizeof(*(valp)) != ABI_0_ESZ); \ + if (NR_ITS_ABIS > 1 && \ + KVM_BUG_ON(__sz != sizeof(*(valp)), __k)) \ + __ret = -EINVAL; \ + else \ + __ret = kvm_read_guest_lock(__k, (g), \ + valp, __sz); \ + __ret; \ + }) + +#define vgic_its_write_entry_lock(i, g, val, t) \ + ({ \ + int __sz = vgic_its_get_abi(i)->t##_esz; \ + struct kvm *__k = (i)->dev->kvm; \ + typeof(val) __v = (val); \ + int __ret; \ + \ + BUILD_BUG_ON(NR_ITS_ABIS == 1 && \ + sizeof(__v) != ABI_0_ESZ); \ + if (NR_ITS_ABIS > 1 && \ + KVM_BUG_ON(__sz != sizeof(__v), __k)) \ + __ret = -EINVAL; \ + else \ + __ret = vgic_write_guest_lock(__k, (g), \ + &__v, __sz); \ + __ret; \ + }) + /* * Creates a new (reference to a) struct vgic_irq for a given LPI. * If this LPI is already mapped on another ITS, we increase its refcount @@ -40,7 +77,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, struct kvm_vcpu *vcpu) { struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq; + struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; unsigned long flags; int ret; @@ -52,45 +89,44 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (!irq) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&irq->lpi_list); + ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); + if (ret) { + kfree(irq); + return ERR_PTR(ret); + } + INIT_LIST_HEAD(&irq->ap_list); raw_spin_lock_init(&irq->irq_lock); irq->config = VGIC_CONFIG_EDGE; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 1); irq->intid = intid; irq->target_vcpu = vcpu; irq->group = 1; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + xa_lock_irqsave(&dist->lpi_xa, flags); /* * There could be a race with another vgic_add_lpi(), so we need to * check that we don't add a second list entry with the same LPI. */ - list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { - if (oldirq->intid != intid) - continue; - + oldirq = xa_load(&dist->lpi_xa, intid); + if (vgic_try_get_irq_ref(oldirq)) { /* Someone was faster with adding this LPI, lets use that. */ kfree(irq); irq = oldirq; - - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() on the returned pointer once it's - * finished with the IRQ. - */ - vgic_get_irq_kref(irq); - - goto out_unlock; + } else { + ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0)); } - list_add_tail(&irq->lpi_list, &dist->lpi_list_head); - dist->lpi_list_count++; + xa_unlock_irqrestore(&dist->lpi_xa, flags); + + if (ret) { + xa_release(&dist->lpi_xa, intid); + kfree(irq); -out_unlock: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + return ERR_PTR(ret); + } /* * We "cache" the configuration table entries in our struct vgic_irq's. @@ -115,50 +151,12 @@ out_unlock: return irq; } -struct its_device { - struct list_head dev_list; - - /* the head for the list of ITTEs */ - struct list_head itt_head; - u32 num_eventid_bits; - gpa_t itt_addr; - u32 device_id; -}; - -#define COLLECTION_NOT_MAPPED ((u32)~0) - -struct its_collection { - struct list_head coll_list; - - u32 collection_id; - u32 target_addr; -}; - -#define its_is_collection_mapped(coll) ((coll) && \ - ((coll)->target_addr != COLLECTION_NOT_MAPPED)) - -struct its_ite { - struct list_head ite_list; - - struct vgic_irq *irq; - struct its_collection *collection; - u32 event_id; -}; - -struct vgic_translation_cache_entry { - struct list_head entry; - phys_addr_t db; - u32 devid; - u32 eventid; - struct vgic_irq *irq; -}; - /** * struct vgic_its_abi - ITS abi ops and settings * @cte_esz: collection table entry size * @dte_esz: device table entry size * @ite_esz: interrupt translation table entry size - * @save tables: save the ITS tables into guest RAM + * @save_tables: save the ITS tables into guest RAM * @restore_tables: restore the ITS internal structs from tables * stored in guest RAM * @commit: initialize the registers which expose the ABI settings, @@ -247,8 +245,10 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, #define GIC_LPI_OFFSET 8192 -#define VITS_TYPER_IDBITS 16 -#define VITS_TYPER_DEVBITS 16 +#define VITS_TYPER_IDBITS 16 +#define VITS_MAX_EVENTID (BIT(VITS_TYPER_IDBITS) - 1) +#define VITS_TYPER_DEVBITS 16 +#define VITS_MAX_DEVID (BIT(VITS_TYPER_DEVBITS) - 1) #define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1) #define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1) @@ -303,79 +303,40 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, } } - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - if (irq->hw) - return its_prop_update_vlpi(irq->host_irq, prop, needs_inv); - - return 0; -} + ret = its_prop_update_vlpi(irq->host_irq, prop, needs_inv); -/* - * Create a snapshot of the current LPIs targeting @vcpu, so that we can - * enumerate those LPIs without holding any lock. - * Returns their number and puts the kmalloc'ed array into intid_ptr. - */ -int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq; - unsigned long flags; - u32 *intids; - int irq_count, i = 0; - - /* - * There is an obvious race between allocating the array and LPIs - * being mapped/unmapped. If we ended up here as a result of a - * command, we're safe (locks are held, preventing another - * command). If coming from another path (such as enabling LPIs), - * we must be careful not to overrun the array. - */ - irq_count = READ_ONCE(dist->lpi_list_count); - intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT); - if (!intids) - return -ENOMEM; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - if (i == irq_count) - break; - /* We don't need to "get" the IRQ, as we hold the list lock. */ - if (vcpu && irq->target_vcpu != vcpu) - continue; - intids[i++] = irq->intid; - } - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); - - *intid_ptr = intids; - return i; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + return ret; } static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) { - int ret = 0; - unsigned long flags; + struct its_vlpi_map map; + int ret; - raw_spin_lock_irqsave(&irq->irq_lock, flags); + guard(raw_spinlock_irqsave)(&irq->irq_lock); irq->target_vcpu = vcpu; - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - if (irq->hw) { - struct its_vlpi_map map; + if (!irq->hw) + return 0; - ret = its_get_vlpi(irq->host_irq, &map); - if (ret) - return ret; + ret = its_get_vlpi(irq->host_irq, &map); + if (ret) + return ret; - if (map.vpe) - atomic_dec(&map.vpe->vlpi_count); - map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - atomic_inc(&map.vpe->vlpi_count); + if (map.vpe) + atomic_dec(&map.vpe->vlpi_count); - ret = its_map_vlpi(irq->host_irq, &map); - } + map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + atomic_inc(&map.vpe->vlpi_count); + return its_map_vlpi(irq->host_irq, &map); +} - return ret; +static struct kvm_vcpu *collection_to_vcpu(struct kvm *kvm, + struct its_collection *col) +{ + return kvm_get_vcpu_by_id(kvm, col->target_addr); } /* @@ -391,7 +352,7 @@ static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite) if (!its_is_collection_mapped(ite->collection)) return; - vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); + vcpu = collection_to_vcpu(kvm, ite->collection); update_affinity(ite->irq, vcpu); } @@ -428,23 +389,18 @@ static u32 max_lpis_propbaser(u64 propbaser) static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) { gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + unsigned long intid, flags; struct vgic_irq *irq; int last_byte_offset = -1; int ret = 0; - u32 *intids; - int nr_irqs, i; - unsigned long flags; u8 pendmask; - nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids); - if (nr_irqs < 0) - return nr_irqs; - - for (i = 0; i < nr_irqs; i++) { + xa_for_each(&dist->lpi_xa, intid, irq) { int byte_offset, bit_nr; - byte_offset = intids[i] / BITS_PER_BYTE; - bit_nr = intids[i] % BITS_PER_BYTE; + byte_offset = intid / BITS_PER_BYTE; + bit_nr = intid % BITS_PER_BYTE; /* * For contiguously allocated LPIs chances are we just read @@ -454,22 +410,23 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) ret = kvm_read_guest_lock(vcpu->kvm, pendbase + byte_offset, &pendmask, 1); - if (ret) { - kfree(intids); + if (ret) return ret; - } + last_byte_offset = byte_offset; } - irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); + irq = vgic_get_irq(vcpu->kvm, intid); + if (!irq) + continue; + raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = pendmask & (1U << bit_nr); + if (irq->target_vcpu == vcpu) + irq->pending_latch = pendmask & (1U << bit_nr); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } - kfree(intids); - return ret; } @@ -545,47 +502,52 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, return 0; } -static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist, - phys_addr_t db, - u32 devid, u32 eventid) +static struct vgic_its *__vgic_doorbell_to_its(struct kvm *kvm, gpa_t db) { - struct vgic_translation_cache_entry *cte; + struct kvm_io_device *kvm_io_dev; + struct vgic_io_device *iodev; - list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { - /* - * If we hit a NULL entry, there is nothing after this - * point. - */ - if (!cte->irq) - break; + kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, db); + if (!kvm_io_dev) + return ERR_PTR(-EINVAL); - if (cte->db != db || cte->devid != devid || - cte->eventid != eventid) - continue; + if (kvm_io_dev->ops != &kvm_io_gic_ops) + return ERR_PTR(-EINVAL); - /* - * Move this entry to the head, as it is the most - * recently used. - */ - if (!list_is_first(&cte->entry, &dist->lpi_translation_cache)) - list_move(&cte->entry, &dist->lpi_translation_cache); + iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); + if (iodev->iodev_type != IODEV_ITS) + return ERR_PTR(-EINVAL); - return cte->irq; - } + return iodev->its; +} + +static unsigned long vgic_its_cache_key(u32 devid, u32 eventid) +{ + return (((unsigned long)devid) << VITS_TYPER_IDBITS) | eventid; - return NULL; } static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, u32 devid, u32 eventid) { - struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long cache_key = vgic_its_cache_key(devid, eventid); + struct vgic_its *its; struct vgic_irq *irq; - unsigned long flags; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - irq = __vgic_its_check_cache(dist, db, devid, eventid); - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + if (devid > VITS_MAX_DEVID || eventid > VITS_MAX_EVENTID) + return NULL; + + its = __vgic_doorbell_to_its(kvm, db); + if (IS_ERR(its)) + return NULL; + + rcu_read_lock(); + + irq = xa_load(&its->translation_cache, cache_key); + if (!vgic_try_get_irq_ref(irq)) + irq = NULL; + + rcu_read_unlock(); return irq; } @@ -594,76 +556,68 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq *irq) { - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_translation_cache_entry *cte; - unsigned long flags; - phys_addr_t db; + unsigned long cache_key = vgic_its_cache_key(devid, eventid); + struct vgic_irq *old; /* Do not cache a directly injected interrupt */ if (irq->hw) return; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - - if (unlikely(list_empty(&dist->lpi_translation_cache))) - goto out; - /* - * We could have raced with another CPU caching the same - * translation behind our back, so let's check it is not in - * already + * The irq refcount is guaranteed to be nonzero while holding the + * its_lock, as the ITE (and the reference it holds) cannot be freed. */ - db = its->vgic_its_base + GITS_TRANSLATER; - if (__vgic_its_check_cache(dist, db, devid, eventid)) - goto out; + lockdep_assert_held(&its->its_lock); + vgic_get_irq_ref(irq); - /* Always reuse the last entry (LRU policy) */ - cte = list_last_entry(&dist->lpi_translation_cache, - typeof(*cte), entry); + old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); /* - * Caching the translation implies having an extra reference - * to the interrupt, so drop the potential reference on what - * was in the cache, and increment it on the new interrupt. + * Put the reference taken on @irq if the store fails. Intentionally do + * not return the error as the translation cache is best effort. */ - if (cte->irq) - __vgic_put_lpi_locked(kvm, cte->irq); - - vgic_get_irq_kref(irq); + if (xa_is_err(old)) { + vgic_put_irq(kvm, irq); + return; + } - cte->db = db; - cte->devid = devid; - cte->eventid = eventid; - cte->irq = irq; + /* + * We could have raced with another CPU caching the same + * translation behind our back, ensure we don't leak a + * reference if that is the case. + */ + if (old) + vgic_put_irq(kvm, old); +} - /* Move the new translation to the head of the list */ - list_move(&cte->entry, &dist->lpi_translation_cache); +static void vgic_its_invalidate_cache(struct vgic_its *its) +{ + struct kvm *kvm = its->dev->kvm; + struct vgic_irq *irq; + unsigned long idx; -out: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + xa_for_each(&its->translation_cache, idx, irq) { + xa_erase(&its->translation_cache, idx); + vgic_put_irq(kvm, irq); + } } -void vgic_its_invalidate_cache(struct kvm *kvm) +void vgic_its_invalidate_all_caches(struct kvm *kvm) { - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_translation_cache_entry *cte; - unsigned long flags; + struct kvm_device *dev; + struct vgic_its *its; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + rcu_read_lock(); - list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { - /* - * If we hit a NULL entry, there is nothing after this - * point. - */ - if (!cte->irq) - break; + list_for_each_entry_rcu(dev, &kvm->devices, vm_node) { + if (dev->ops != &kvm_arm_vgic_its_ops) + continue; - __vgic_put_lpi_locked(kvm, cte->irq); - cte->irq = NULL; + its = dev->private; + vgic_its_invalidate_cache(its); } - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + rcu_read_unlock(); } int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, @@ -679,7 +633,7 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, if (!ite || !its_is_collection_mapped(ite->collection)) return E_ITS_INT_UNMAPPED_INTERRUPT; - vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); + vcpu = collection_to_vcpu(kvm, ite->collection); if (!vcpu) return E_ITS_INT_UNMAPPED_INTERRUPT; @@ -695,8 +649,6 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) { u64 address; - struct kvm_io_device *kvm_io_dev; - struct vgic_io_device *iodev; if (!vgic_has_its(kvm)) return ERR_PTR(-ENODEV); @@ -706,18 +658,7 @@ struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) address = (u64)msi->address_hi << 32 | msi->address_lo; - kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); - if (!kvm_io_dev) - return ERR_PTR(-EINVAL); - - if (kvm_io_dev->ops != &kvm_io_gic_ops) - return ERR_PTR(-EINVAL); - - iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); - if (iodev->iodev_type != IODEV_ITS) - return ERR_PTR(-EINVAL); - - return iodev->its; + return __vgic_doorbell_to_its(kvm, address); } /* @@ -763,6 +704,7 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; vgic_queue_irq_unlock(kvm, irq, flags); + vgic_put_irq(kvm, irq); return 0; } @@ -806,12 +748,17 @@ int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) /* Requires the its_lock to be held. */ static void its_free_ite(struct kvm *kvm, struct its_ite *ite) { + struct vgic_irq *irq = ite->irq; list_del(&ite->ite_list); /* This put matches the get in vgic_add_lpi. */ - if (ite->irq) { - if (ite->irq->hw) - WARN_ON(its_unmap_vlpi(ite->irq->host_irq)); + if (irq) { + scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) { + if (irq->hw) + its_unmap_vlpi(ite->irq->host_irq); + + irq->hw = false; + } vgic_put_irq(kvm, ite->irq); } @@ -847,15 +794,19 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, ite = find_ite(its, device_id, event_id); if (ite && its_is_collection_mapped(ite->collection)) { + struct its_device *device = find_its_device(its, device_id); + int ite_esz = vgic_its_get_abi(its)->ite_esz; + gpa_t gpa = device->itt_addr + ite->event_id * ite_esz; /* * Though the spec talks about removing the pending state, we * don't bother here since we clear the ITTE anyway and the * pending state is a property of the ITTE struct. */ - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); its_free_ite(kvm, ite); - return 0; + + return vgic_its_write_entry_lock(its, gpa, 0ULL, ite); } return E_ITS_DISCARD_UNMAPPED_INTERRUPT; @@ -887,9 +838,9 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, return E_ITS_MOVI_UNMAPPED_COLLECTION; ite->collection = collection; - vcpu = kvm_get_vcpu(kvm, collection->target_addr); + vcpu = collection_to_vcpu(kvm, collection); - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); return update_affinity(ite->irq, vcpu); } @@ -924,7 +875,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, switch (type) { case GITS_BASER_TYPE_DEVICE: - if (id >= BIT_ULL(VITS_TYPER_DEVBITS)) + if (id > VITS_MAX_DEVID) return false; break; case GITS_BASER_TYPE_COLLECTION: @@ -1121,7 +1072,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, } if (its_is_collection_mapped(collection)) - vcpu = kvm_get_vcpu(kvm, collection->target_addr); + vcpu = collection_to_vcpu(kvm, collection); irq = vgic_add_lpi(kvm, lpi_nr, vcpu); if (IS_ERR(irq)) { @@ -1136,7 +1087,8 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, } /* Requires the its_lock to be held. */ -static void vgic_its_free_device(struct kvm *kvm, struct its_device *device) +static void vgic_its_free_device(struct kvm *kvm, struct vgic_its *its, + struct its_device *device) { struct its_ite *ite, *temp; @@ -1148,7 +1100,7 @@ static void vgic_its_free_device(struct kvm *kvm, struct its_device *device) list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list) its_free_ite(kvm, ite); - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); list_del(&device->dev_list); kfree(device); @@ -1160,7 +1112,7 @@ static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its) struct its_device *cur, *temp; list_for_each_entry_safe(cur, temp, &its->device_list, dev_list) - vgic_its_free_device(kvm, cur); + vgic_its_free_device(kvm, its, cur); } /* its lock must be held */ @@ -1204,8 +1156,9 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, u8 num_eventid_bits = its_cmd_get_size(its_cmd); gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); struct its_device *device; + gpa_t gpa; - if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL)) + if (!vgic_its_check_id(its, its->baser_device_table, device_id, &gpa)) return E_ITS_MAPD_DEVICE_OOR; if (valid && num_eventid_bits > VITS_TYPER_IDBITS) @@ -1219,14 +1172,14 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, * by removing the mapping and re-establishing it. */ if (device) - vgic_its_free_device(kvm, device); + vgic_its_free_device(kvm, its, device); /* * The spec does not say whether unmapping a not-mapped device * is an error, so we are done in any case. */ if (!valid) - return 0; + return vgic_its_write_entry_lock(its, gpa, 0ULL, dte); device = vgic_its_alloc_device(its, device_id, itt_addr, num_eventid_bits); @@ -1242,21 +1195,22 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u16 coll_id; - u32 target_addr; struct its_collection *collection; bool valid; valid = its_cmd_get_validbit(its_cmd); coll_id = its_cmd_get_collection(its_cmd); - target_addr = its_cmd_get_target_addr(its_cmd); - - if (target_addr >= atomic_read(&kvm->online_vcpus)) - return E_ITS_MAPC_PROCNUM_OOR; if (!valid) { vgic_its_free_collection(its, coll_id); - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); } else { + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd)); + if (!vcpu) + return E_ITS_MAPC_PROCNUM_OOR; + collection = find_collection(its, coll_id); if (!collection) { @@ -1270,9 +1224,9 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, coll_id); if (ret) return ret; - collection->target_addr = target_addr; + collection->target_addr = vcpu->vcpu_id; } else { - collection->target_addr = target_addr; + collection->target_addr = vcpu->vcpu_id; update_affinity_collection(kvm, its, collection); } } @@ -1330,8 +1284,8 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, } /** - * vgic_its_invall - invalidate all LPIs targetting a given vcpu - * @vcpu: the vcpu for which the RD is targetted by an invalidation + * vgic_its_invall - invalidate all LPIs targeting a given vcpu + * @vcpu: the vcpu for which the RD is targeted by an invalidation * * Contrary to the INVALL command, this targets a RD instead of a * collection, and we don't need to hold the its_lock, since no ITS is @@ -1340,23 +1294,19 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, int vgic_its_invall(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - int irq_count, i = 0; - u32 *intids; - - irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); - if (irq_count < 0) - return irq_count; + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + unsigned long intid; - for (i = 0; i < irq_count; i++) { - struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intids[i]); + xa_for_each(&dist->lpi_xa, intid, irq) { + irq = vgic_get_irq(kvm, intid); if (!irq) continue; + update_lpi_config(kvm, irq, vcpu, false); vgic_put_irq(kvm, irq); } - kfree(intids); - if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); @@ -1382,7 +1332,7 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, if (!its_is_collection_mapped(collection)) return E_ITS_INVALL_UNMAPPED_COLLECTION; - vcpu = kvm_get_vcpu(kvm, collection->target_addr); + vcpu = collection_to_vcpu(kvm, collection); vgic_its_invall(vcpu); return 0; @@ -1399,38 +1349,33 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { - u32 target1_addr = its_cmd_get_target_addr(its_cmd); - u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); + struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu1, *vcpu2; struct vgic_irq *irq; - u32 *intids; - int irq_count, i; + unsigned long intid; + + /* We advertise GITS_TYPER.PTA==0, making the address the vcpu ID */ + vcpu1 = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd)); + vcpu2 = kvm_get_vcpu_by_id(kvm, its_cmd_mask_field(its_cmd, 3, 16, 32)); - if (target1_addr >= atomic_read(&kvm->online_vcpus) || - target2_addr >= atomic_read(&kvm->online_vcpus)) + if (!vcpu1 || !vcpu2) return E_ITS_MOVALL_PROCNUM_OOR; - if (target1_addr == target2_addr) + if (vcpu1 == vcpu2) return 0; - vcpu1 = kvm_get_vcpu(kvm, target1_addr); - vcpu2 = kvm_get_vcpu(kvm, target2_addr); - - irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids); - if (irq_count < 0) - return irq_count; - - for (i = 0; i < irq_count; i++) { - irq = vgic_get_irq(kvm, NULL, intids[i]); + xa_for_each(&dist->lpi_xa, intid, irq) { + irq = vgic_get_irq(kvm, intid); + if (!irq) + continue; update_affinity(irq, vcpu2); vgic_put_irq(kvm, irq); } - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); - kfree(intids); return 0; } @@ -1781,7 +1726,7 @@ static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, its->enabled = !!(val & GITS_CTLR_ENABLE); if (!its->enabled) - vgic_its_invalidate_cache(kvm); + vgic_its_invalidate_cache(its); /* * Try to process any pending commands. This function bails out early @@ -1882,47 +1827,6 @@ out: return ret; } -/* Default is 16 cached LPIs per vcpu */ -#define LPI_DEFAULT_PCPU_CACHE_SIZE 16 - -void vgic_lpi_translation_cache_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - unsigned int sz; - int i; - - if (!list_empty(&dist->lpi_translation_cache)) - return; - - sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE; - - for (i = 0; i < sz; i++) { - struct vgic_translation_cache_entry *cte; - - /* An allocation failure is not fatal */ - cte = kzalloc(sizeof(*cte), GFP_KERNEL_ACCOUNT); - if (WARN_ON(!cte)) - break; - - INIT_LIST_HEAD(&cte->entry); - list_add(&cte->entry, &dist->lpi_translation_cache); - } -} - -void vgic_lpi_translation_cache_destroy(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_translation_cache_entry *cte, *tmp; - - vgic_its_invalidate_cache(kvm); - - list_for_each_entry_safe(cte, tmp, - &dist->lpi_translation_cache, entry) { - list_del(&cte->entry); - kfree(cte); - } -} - #define INITIAL_BASER_VALUE \ (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ @@ -1936,6 +1840,7 @@ void vgic_lpi_translation_cache_destroy(struct kvm *kvm) static int vgic_its_create(struct kvm_device *dev, u32 type) { + int ret; struct vgic_its *its; if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) @@ -1945,23 +1850,33 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) if (!its) return -ENOMEM; + mutex_lock(&dev->kvm->arch.config_lock); + if (vgic_initialized(dev->kvm)) { - int ret = vgic_v4_init(dev->kvm); + ret = vgic_v4_init(dev->kvm); if (ret < 0) { + mutex_unlock(&dev->kvm->arch.config_lock); kfree(its); return ret; } - - vgic_lpi_translation_cache_init(dev->kvm); } mutex_init(&its->its_lock); mutex_init(&its->cmd_lock); + /* Yep, even more trickery for lock ordering... */ +#ifdef CONFIG_LOCKDEP + mutex_lock(&its->cmd_lock); + mutex_lock(&its->its_lock); + mutex_unlock(&its->its_lock); + mutex_unlock(&its->cmd_lock); +#endif + its->vgic_its_base = VGIC_ADDR_UNDEF; INIT_LIST_HEAD(&its->device_list); INIT_LIST_HEAD(&its->collection_list); + xa_init(&its->translation_cache); dev->kvm->arch.vgic.msis_require_devid = true; dev->kvm->arch.vgic.has_its = true; @@ -1976,7 +1891,11 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) dev->private = its; - return vgic_its_set_abi(its, NR_ITS_ABIS - 1); + ret = vgic_its_set_abi(its, NR_ITS_ABIS - 1); + + mutex_unlock(&dev->kvm->arch.config_lock); + + return ret; } static void vgic_its_destroy(struct kvm_device *kvm_dev) @@ -1986,8 +1905,12 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev) mutex_lock(&its->its_lock); + vgic_its_debug_destroy(kvm_dev); + vgic_its_free_device_list(kvm, its); vgic_its_free_collection_list(kvm, its); + vgic_its_invalidate_cache(its); + xa_destroy(&its->translation_cache); mutex_unlock(&its->its_lock); kfree(its); @@ -2045,6 +1968,13 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->lock); + if (kvm_trylock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + + mutex_lock(&dev->kvm->arch.config_lock); + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { ret = -ENXIO; goto out; @@ -2058,11 +1988,6 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev, goto out; } - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - addr = its->vgic_its_base + offset; len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4; @@ -2076,8 +2001,9 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev, } else { *reg = region->its_read(dev->kvm, its, addr, len); } - unlock_all_vcpus(dev->kvm); out: + mutex_unlock(&dev->kvm->arch.config_lock); + kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); return ret; } @@ -2110,7 +2036,7 @@ static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite) } /** - * entry_fn_t - Callback called on a table entry restore path + * typedef entry_fn_t - Callback called on a table entry restore path * @its: its handle * @id: id of the entry * @entry: pointer to the entry @@ -2133,6 +2059,7 @@ typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, * @start_id: the ID of the first entry in the table * (non zero for 2d level tables) * @fn: function to apply on each entry + * @opaque: pointer to opaque data * * Return: < 0 on error, 0 if last element was identified, 1 otherwise * (the last element may not be found on second level tables) @@ -2172,13 +2099,12 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, return 1; } -/** +/* * vgic_its_save_ite - Save an interrupt translation entry at @gpa */ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, - struct its_ite *ite, gpa_t gpa, int ite_esz) + struct its_ite *ite, gpa_t gpa) { - struct kvm *kvm = its->dev->kvm; u32 next_offset; u64 val; @@ -2187,11 +2113,14 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); + + return vgic_its_write_entry_lock(its, gpa, val, ite); } /** * vgic_its_restore_ite - restore an interrupt translation entry + * + * @its: its handle * @event_id: id used for indexing * @ptr: pointer to the ITE entry * @opaque: pointer to the its_device @@ -2239,7 +2168,7 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, return PTR_ERR(ite); if (its_is_collection_mapped(collection)) - vcpu = kvm_get_vcpu(kvm, collection->target_addr); + vcpu = kvm_get_vcpu_by_id(kvm, collection->target_addr); irq = vgic_add_lpi(kvm, lpi_id, vcpu); if (IS_ERR(irq)) { @@ -2285,7 +2214,7 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1) return -EACCES; - ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz); + ret = vgic_its_save_ite(its, device, ite, gpa); if (ret) return ret; } @@ -2326,9 +2255,8 @@ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) * @ptr: GPA */ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, - gpa_t ptr, int dte_esz) + gpa_t ptr) { - struct kvm *kvm = its->dev->kvm; u64 val, itt_addr_field; u32 next_offset; @@ -2339,7 +2267,8 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); + + return vgic_its_write_entry_lock(its, ptr, val, dte); } /** @@ -2387,7 +2316,7 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id, ret = vgic_its_restore_itt(its, dev); if (ret) { - vgic_its_free_device(its->dev->kvm, dev); + vgic_its_free_device(its->dev->kvm, its, dev); return ret; } @@ -2406,7 +2335,7 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a, return 1; } -/** +/* * vgic_its_save_device_tables - Save the device table and all ITT * into guest RAM * @@ -2415,10 +2344,8 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a, */ static int vgic_its_save_device_tables(struct vgic_its *its) { - const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 baser = its->baser_device_table; struct its_device *dev; - int dte_esz = abi->dte_esz; if (!(baser & GITS_BASER_VALID)) return 0; @@ -2437,7 +2364,7 @@ static int vgic_its_save_device_tables(struct vgic_its *its) if (ret) return ret; - ret = vgic_its_save_dte(its, dev, eaddr, dte_esz); + ret = vgic_its_save_dte(its, dev, eaddr); if (ret) return ret; } @@ -2479,7 +2406,7 @@ static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, return ret; } -/** +/* * vgic_its_restore_device_tables - Restore the device table and all ITT * from guest RAM to internal data structs */ @@ -2518,7 +2445,7 @@ static int vgic_its_restore_device_tables(struct vgic_its *its) static int vgic_its_save_cte(struct vgic_its *its, struct its_collection *collection, - gpa_t gpa, int esz) + gpa_t gpa) { u64 val; @@ -2526,7 +2453,8 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); + + return vgic_its_write_entry_lock(its, gpa, val, cte); } /* @@ -2534,7 +2462,7 @@ static int vgic_its_save_cte(struct vgic_its *its, * Return +1 on success, 0 if the entry was invalid (which should be * interpreted as end-of-table), and a negative error value for generic errors. */ -static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) +static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa) { struct its_collection *collection; struct kvm *kvm = its->dev->kvm; @@ -2542,8 +2470,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) u64 val; int ret; - BUG_ON(esz > sizeof(val)); - ret = kvm_read_guest_lock(kvm, gpa, &val, esz); + ret = vgic_its_read_entry_lock(its, gpa, &val, cte); if (ret) return ret; val = le64_to_cpu(val); @@ -2554,7 +2481,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) coll_id = val & KVM_ITS_CTE_ICID_MASK; if (target_addr != COLLECTION_NOT_MAPPED && - target_addr >= atomic_read(&kvm->online_vcpus)) + !kvm_get_vcpu_by_id(kvm, target_addr)) return -EINVAL; collection = find_collection(its, coll_id); @@ -2571,7 +2498,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) return 1; } -/** +/* * vgic_its_save_collection_table - Save the collection table into * guest RAM */ @@ -2581,7 +2508,6 @@ static int vgic_its_save_collection_table(struct vgic_its *its) u64 baser = its->baser_coll_table; gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); struct its_collection *collection; - u64 val; size_t max_size, filled = 0; int ret, cte_esz = abi->cte_esz; @@ -2591,7 +2517,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; list_for_each_entry(collection, &its->collection_list, coll_list) { - ret = vgic_its_save_cte(its, collection, gpa, cte_esz); + ret = vgic_its_save_cte(its, collection, gpa); if (ret) return ret; gpa += cte_esz; @@ -2605,13 +2531,10 @@ static int vgic_its_save_collection_table(struct vgic_its *its) * table is not fully filled, add a last dummy element * with valid bit unset */ - val = 0; - BUG_ON(cte_esz > sizeof(val)); - ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); - return ret; + return vgic_its_write_entry_lock(its, gpa, 0ULL, cte); } -/** +/* * vgic_its_restore_collection_table - reads the collection table * in guest memory and restores the ITS internal state. Requires the * BASER registers to be restored before. @@ -2633,7 +2556,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; while (read < max_size) { - ret = vgic_its_restore_cte(its, gpa, cte_esz); + ret = vgic_its_restore_cte(its, gpa); if (ret <= 0) break; gpa += cte_esz; @@ -2649,7 +2572,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) return ret; } -/** +/* * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM * according to v0 ABI */ @@ -2664,7 +2587,7 @@ static int vgic_its_save_tables_v0(struct vgic_its *its) return vgic_its_save_collection_table(its); } -/** +/* * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM * to internal data structs according to V0 ABI * @@ -2743,37 +2666,39 @@ static int vgic_its_has_attr(struct kvm_device *dev, static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); - struct vgic_dist *dist = &kvm->arch.vgic; int ret = 0; if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ return 0; mutex_lock(&kvm->lock); - mutex_lock(&its->its_lock); - if (!lock_all_vcpus(kvm)) { - mutex_unlock(&its->its_lock); + if (kvm_trylock_all_vcpus(kvm)) { mutex_unlock(&kvm->lock); return -EBUSY; } + mutex_lock(&kvm->arch.config_lock); + mutex_lock(&its->its_lock); + switch (attr) { case KVM_DEV_ARM_ITS_CTRL_RESET: vgic_its_reset(kvm, its); break; case KVM_DEV_ARM_ITS_SAVE_TABLES: - dist->save_its_tables_in_progress = true; ret = abi->save_tables(its); - dist->save_its_tables_in_progress = false; break; case KVM_DEV_ARM_ITS_RESTORE_TABLES: ret = abi->restore_tables(its); break; + default: + ret = -ENXIO; + break; } - unlock_all_vcpus(kvm); mutex_unlock(&its->its_lock); + mutex_unlock(&kvm->arch.config_lock); + kvm_unlock_all_vcpus(kvm); mutex_unlock(&kvm->lock); return ret; } @@ -2792,7 +2717,7 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - return dist->save_its_tables_in_progress; + return dist->table_write_in_progress; } static int vgic_its_set_attr(struct kvm_device *dev, @@ -2818,7 +2743,12 @@ static int vgic_its_set_attr(struct kvm_device *dev, if (ret) return ret; - return vgic_register_its_iodev(dev->kvm, its, addr); + ret = vgic_register_its_iodev(dev->kvm, its, addr); + if (ret) + return ret; + + return vgic_its_debug_init(dev); + } case KVM_DEV_ARM_VGIC_GRP_CTRL: return vgic_its_ctrl(dev->kvm, its, attr->attr); diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index edeac2380591..3d1a776b716d 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -5,6 +5,7 @@ * Copyright (C) 2015 ARM Ltd. * Author: Marc Zyngier <marc.zyngier@arm.com> */ +#include <linux/irqchip/arm-gic-v3.h> #include <linux/kvm_host.h> #include <kvm/arm_vgic.h> #include <linux/uaccess.h> @@ -27,7 +28,8 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, if (addr + size < addr) return -EINVAL; - if (addr & ~kvm_phys_mask(kvm) || addr + size > kvm_phys_size(kvm)) + if (addr & ~kvm_phys_mask(&kvm->arch.mmu) || + (addr + size) > kvm_phys_size(&kvm->arch.mmu)) return -E2BIG; return 0; @@ -46,7 +48,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev struct vgic_dist *vgic = &kvm->arch.vgic; int r; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); @@ -68,7 +70,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev r = -ENODEV; } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); return r; } @@ -102,7 +104,11 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri if (get_user(addr, uaddr)) return -EFAULT; - mutex_lock(&kvm->lock); + /* + * Since we can't hold config_lock while registering the redistributor + * iodevs, take the slots_lock immediately. + */ + mutex_lock(&kvm->slots_lock); switch (attr->attr) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); @@ -182,6 +188,7 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri if (r) goto out; + mutex_lock(&kvm->arch.config_lock); if (write) { r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size); if (!r) @@ -189,9 +196,10 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri } else { addr = *addr_ptr; } + mutex_unlock(&kvm->arch.config_lock); out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->slots_lock); if (!r && !write) r = put_user(addr, uaddr); @@ -227,24 +235,29 @@ static int vgic_set_common_attr(struct kvm_device *dev, (val & 31)) return -EINVAL; - mutex_lock(&dev->kvm->lock); + mutex_lock(&dev->kvm->arch.config_lock); - if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) + /* + * Either userspace has already configured NR_IRQS or + * the vgic has already been initialized and vgic_init() + * supplied a default amount of SPIs. + */ + if (dev->kvm->arch.vgic.nr_spis) ret = -EBUSY; else dev->kvm->arch.vgic.nr_spis = val - VGIC_NR_PRIVATE_IRQS; - mutex_unlock(&dev->kvm->lock); + mutex_unlock(&dev->kvm->arch.config_lock); return ret; } case KVM_DEV_ARM_VGIC_GRP_CTRL: { switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: - mutex_lock(&dev->kvm->lock); + mutex_lock(&dev->kvm->arch.config_lock); r = vgic_init(dev->kvm); - mutex_unlock(&dev->kvm->lock); + mutex_unlock(&dev->kvm->arch.config_lock); return r; case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: /* @@ -256,12 +269,15 @@ static int vgic_set_common_attr(struct kvm_device *dev, return -ENXIO; mutex_lock(&dev->kvm->lock); - if (!lock_all_vcpus(dev->kvm)) { + if (kvm_trylock_all_vcpus(dev->kvm)) { mutex_unlock(&dev->kvm->lock); return -EBUSY; } + + mutex_lock(&dev->kvm->arch.config_lock); r = vgic_v3_save_pending_tables(dev->kvm); - unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->arch.config_lock); + kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); return r; } @@ -328,58 +344,16 @@ int kvm_register_vgic_device(unsigned long type) int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr) { - int cpuid; + int cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr); - cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> - KVM_DEV_ARM_VGIC_CPUID_SHIFT; - - if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) - return -EINVAL; - - reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid); reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + reg_attr->vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid); + if (!reg_attr->vcpu) + return -EINVAL; return 0; } -/* unlocks vcpus from @vcpu_lock_idx and smaller */ -static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) -{ - struct kvm_vcpu *tmp_vcpu; - - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); - mutex_unlock(&tmp_vcpu->mutex); - } -} - -void unlock_all_vcpus(struct kvm *kvm) -{ - unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); -} - -/* Returns true if all vcpus were locked, false otherwise */ -bool lock_all_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *tmp_vcpu; - unsigned long c; - - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run and fiddle with the vgic state while we - * access it. - */ - kvm_for_each_vcpu(c, tmp_vcpu, kvm) { - if (!mutex_trylock(&tmp_vcpu->mutex)) { - unlock_vcpus(kvm, c - 1); - return false; - } - } - - return true; -} - /** * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state * @@ -411,15 +385,17 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->lock); + if (kvm_trylock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + + mutex_lock(&dev->kvm->arch.config_lock); + ret = vgic_init(dev->kvm); if (ret) goto out; - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val); @@ -432,8 +408,9 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, break; } - unlock_all_vcpus(dev->kvm); out: + mutex_unlock(&dev->kvm->arch.config_lock); + kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); if (!ret && !is_write) @@ -528,6 +505,24 @@ int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, } /* + * Allow access to certain ID-like registers prior to VGIC initialization, + * thereby allowing the VMM to provision the features / sizing of the VGIC. + */ +static bool reg_allowed_pre_init(struct kvm_device_attr *attr) +{ + if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) + return false; + + switch (attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK) { + case GICD_IIDR: + case GICD_TYPER2: + return true; + default: + return false; + } +} + +/* * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state * * @dev: kvm device handle @@ -569,12 +564,14 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->lock); - if (unlikely(!vgic_initialized(dev->kvm))) { - ret = -EBUSY; - goto out; + if (kvm_trylock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; } - if (!lock_all_vcpus(dev->kvm)) { + mutex_lock(&dev->kvm->arch.config_lock); + + if (!(vgic_initialized(dev->kvm) || reg_allowed_pre_init(attr))) { ret = -EBUSY; goto out; } @@ -609,8 +606,9 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, break; } - unlock_all_vcpus(dev->kvm); out: + mutex_unlock(&dev->kvm->arch.config_lock); + kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); if (!ret && uaccess && !is_write) { @@ -630,6 +628,23 @@ static int vgic_v3_set_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: return vgic_v3_attr_regs_access(dev, attr, true); + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { + u32 __user *uaddr = (u32 __user *)attr->addr; + u32 val; + + if (get_user(val, uaddr)) + return -EFAULT; + + guard(mutex)(&dev->kvm->arch.config_lock); + if (vgic_initialized(dev->kvm)) + return -EBUSY; + + if (!irq_is_ppi(val)) + return -EINVAL; + + dev->kvm->arch.vgic.mi_intid = val; + return 0; + } default: return vgic_set_common_attr(dev, attr); } @@ -644,6 +659,12 @@ static int vgic_v3_get_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: return vgic_v3_attr_regs_access(dev, attr, false); + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + + guard(mutex)(&dev->kvm->arch.config_lock); + return put_user(dev->kvm->arch.vgic.mi_intid, uaddr); + } default: return vgic_get_common_attr(dev, attr); } @@ -666,6 +687,7 @@ static int vgic_v3_has_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: return vgic_v3_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: return 0; case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index e070cda86e12..406845b3117c 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -148,7 +148,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, if (!(targets & (1U << c))) continue; - irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); + irq = vgic_get_vcpu_irq(vcpu, intid); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; @@ -167,7 +167,7 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, u64 val = 0; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); val |= (u64)irq->targets << (i * 8); @@ -191,7 +191,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, return; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid + i); int target; raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -213,7 +213,7 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, u64 val = 0; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); val |= (u64)irq->source << (i * 8); @@ -231,7 +231,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, unsigned long flags; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -253,7 +253,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, unsigned long flags; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -359,6 +359,16 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, vgic_set_vmcr(vcpu, &vmcr); } +static void vgic_mmio_write_dir(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_deactivate(vcpu, val); + else + vgic_v3_deactivate(vcpu, val); +} + static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -482,6 +492,10 @@ static const struct vgic_register_region vgic_v2_cpu_registers[] = { REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_CPU_DEACTIVATE, + vgic_mmio_read_raz, vgic_mmio_write_dir, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, + 4, VGIC_ACCESS_32bit), }; unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) @@ -494,6 +508,16 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) return SZ_4K; } +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v2_cpu_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return KVM_VGIC_V2_CPU_SIZE; +} + int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) { const struct vgic_register_region *region; diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 91201f743033..70d50c77e5dc 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -50,8 +50,25 @@ bool vgic_has_its(struct kvm *kvm) bool vgic_supports_direct_msis(struct kvm *kvm) { - return (kvm_vgic_global_state.has_gicv4_1 || - (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm))); + /* + * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware, + * indirectly allowing userspace to control whether or not vPEs are + * allocated for the VM. + */ + if (system_supports_direct_sgis() && !vgic_supports_direct_sgis(kvm)) + return false; + + return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm); +} + +bool system_supports_direct_sgis(void) +{ + return kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi(); +} + +bool vgic_supports_direct_sgis(struct kvm *kvm) +{ + return kvm->arch.vgic.nassgicap; } /* @@ -86,7 +103,7 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, } break; case GICD_TYPER2: - if (kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi()) + if (vgic_supports_direct_sgis(vcpu->kvm)) value = GICD_TYPER2_nASSGIcap; break; case GICD_IIDR: @@ -111,7 +128,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, case GICD_CTLR: { bool was_enabled, is_hwsgi; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); was_enabled = dist->enabled; is_hwsgi = dist->nassgireq; @@ -119,7 +136,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; /* Not a GICv4.1? No HW SGIs */ - if (!kvm_vgic_global_state.has_gicv4_1 || !gic_cpuif_has_vsgi()) + if (!vgic_supports_direct_sgis(vcpu->kvm)) val &= ~GICD_CTLR_nASSGIreq; /* Dist stays enabled? nASSGIreq is RO */ @@ -133,13 +150,13 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, if (is_hwsgi != dist->nassgireq) vgic_v4_configure_vsgis(vcpu->kvm); - if (kvm_vgic_global_state.has_gicv4_1 && + if (vgic_supports_direct_sgis(vcpu->kvm) && was_enabled != dist->enabled) kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4); else if (!was_enabled && dist->enabled) vgic_kick_vcpus(vcpu->kvm); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); break; } case GICD_TYPER: @@ -159,8 +176,18 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, switch (addr & 0x0c) { case GICD_TYPER2: - if (val != vgic_mmio_read_v3_misc(vcpu, addr, len)) + reg = vgic_mmio_read_v3_misc(vcpu, addr, len); + + if (reg == val) + return 0; + if (vgic_initialized(vcpu->kvm)) + return -EBUSY; + if ((reg ^ val) & ~GICD_TYPER2_nASSGIcap) return -EINVAL; + if (!system_supports_direct_sgis() && val) + return -EINVAL; + + dist->nassgicap = val & GICD_TYPER2_nASSGIcap; return 0; case GICD_IIDR: reg = vgic_mmio_read_v3_misc(vcpu, addr, len); @@ -178,7 +205,7 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, } case GICD_CTLR: /* Not a GICv4.1? No HW SGIs */ - if (!kvm_vgic_global_state.has_gicv4_1) + if (!vgic_supports_direct_sgis(vcpu->kvm)) val &= ~GICD_CTLR_nASSGIreq; dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; @@ -194,7 +221,7 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid); unsigned long ret = 0; if (!irq) @@ -220,7 +247,7 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, if (addr & 4) return; - irq = vgic_get_irq(vcpu->kvm, NULL, intid); + irq = vgic_get_irq(vcpu->kvm, intid); if (!irq) return; @@ -277,7 +304,7 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, return; vgic_flush_pending_lpis(vcpu); - vgic_its_invalidate_cache(vcpu->kvm); + vgic_its_invalidate_all_caches(vcpu->kvm); atomic_set_release(&vgic_cpu->ctlr, 0); } else { ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0, @@ -357,31 +384,13 @@ static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - if (test_bit(i, &val)) { - /* - * pending_latch is set irrespective of irq type - * (level or edge) to avoid dependency that VM should - * restore irq config before pending info. - */ - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { - irq->pending_latch = false; - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } + int ret; - vgic_put_irq(vcpu->kvm, irq); - } + ret = vgic_uaccess_write_spending(vcpu, addr, len, val); + if (ret) + return ret; - return 0; + return vgic_uaccess_write_cpending(vcpu, addr, len, ~val); } /* We want to avoid outer shareable. */ @@ -548,6 +557,7 @@ static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu, unsigned long val) { struct vgic_irq *irq; + u32 intid; /* * If the guest wrote only to the upper 32bit part of the @@ -559,9 +569,13 @@ static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu, if ((addr & 4) || !vgic_lpis_enabled(vcpu)) return; + intid = lower_32_bits(val); + if (intid < VGIC_MIN_LPI) + return; + vgic_set_rdist_busy(vcpu, true); - irq = vgic_get_irq(vcpu->kvm, NULL, lower_32_bits(val)); + irq = vgic_get_irq(vcpu->kvm, intid); if (irq) { vgic_its_inv_lpi(vcpu->kvm, irq); vgic_put_irq(vcpu->kvm, irq); @@ -769,10 +783,13 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; struct vgic_redist_region *rdreg; gpa_t rd_base; - int ret; + int ret = 0; + + lockdep_assert_held(&kvm->slots_lock); + mutex_lock(&kvm->arch.config_lock); if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) - return 0; + goto out_unlock; /* * We may be creating VCPUs before having set the base address for the @@ -782,10 +799,12 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) */ rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions); if (!rdreg) - return 0; + goto out_unlock; - if (!vgic_v3_check_base(kvm)) - return -EINVAL; + if (!vgic_v3_check_base(kvm)) { + ret = -EINVAL; + goto out_unlock; + } vgic_cpu->rdreg = rdreg; vgic_cpu->rdreg_index = rdreg->free_index; @@ -799,19 +818,23 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); rd_dev->redist_vcpu = vcpu; - mutex_lock(&kvm->slots_lock); + mutex_unlock(&kvm->arch.config_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base, 2 * SZ_64K, &rd_dev->dev); - mutex_unlock(&kvm->slots_lock); - if (ret) return ret; + /* Protected by slots_lock */ rdreg->free_index++; return 0; + +out_unlock: + mutex_unlock(&kvm->arch.config_lock); + return ret; } -static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) +void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) { struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; @@ -824,6 +847,8 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm) unsigned long c; int ret = 0; + lockdep_assert_held(&kvm->slots_lock); + kvm_for_each_vcpu(c, vcpu, kvm) { ret = vgic_register_redist_iodev(vcpu); if (ret) @@ -834,12 +859,10 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm) /* The current c failed, so iterate over the previous ones. */ int i; - mutex_lock(&kvm->slots_lock); for (i = 0; i < c; i++) { vcpu = kvm_get_vcpu(kvm, i); vgic_unregister_redist_iodev(vcpu); } - mutex_unlock(&kvm->slots_lock); } return ret; @@ -928,8 +951,19 @@ free: return ret; } -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg) +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg) { + struct kvm_vcpu *vcpu; + unsigned long c; + + lockdep_assert_held(&kvm->arch.config_lock); + + /* Garbage collect the region */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (vcpu->arch.vgic_cpu.rdreg == rdreg) + vcpu->arch.vgic_cpu.rdreg = NULL; + } + list_del(&rdreg->list); kfree(rdreg); } @@ -938,7 +972,9 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) { int ret; + mutex_lock(&kvm->arch.config_lock); ret = vgic_v3_alloc_redist_region(kvm, index, addr, count); + mutex_unlock(&kvm->arch.config_lock); if (ret) return ret; @@ -950,8 +986,10 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) if (ret) { struct vgic_redist_region *rdreg; + mutex_lock(&kvm->arch.config_lock); rdreg = vgic_v3_rdist_region_from_index(kvm, index); - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); + mutex_unlock(&kvm->arch.config_lock); return ret; } @@ -1002,35 +1040,6 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) return 0; } -/* - * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI - * generation register ICC_SGI1R_EL1) with a given VCPU. - * If the VCPU's MPIDR matches, return the level0 affinity, otherwise - * return -1. - */ -static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) -{ - unsigned long affinity; - int level0; - - /* - * Split the current VCPU's MPIDR into affinity level 0 and the - * rest as this is what we have to compare against. - */ - affinity = kvm_vcpu_get_mpidr_aff(vcpu); - level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); - affinity &= ~MPIDR_LEVEL_MASK; - - /* bail out if the upper three levels don't match */ - if (sgi_aff != affinity) - return -1; - - /* Is this VCPU's bit set in the mask ? */ - if (!(sgi_cpu_mask & BIT(level0))) - return -1; - - return level0; -} /* * The ICC_SGI* registers encode the affinity differently from the MPIDR, @@ -1041,6 +1050,38 @@ static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) +static void vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, u32 sgi, bool allow_group1) +{ + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, sgi); + unsigned long flags; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* + * An access targeting Group0 SGIs can only generate + * those, while an access targeting Group1 SGIs can + * generate interrupts of either group. + */ + if (!irq->group || allow_group1) { + if (!irq->hw) { + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } else { + /* HW SGI? Ask the GIC to inject it */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + true); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + } else { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + + vgic_put_irq(vcpu->kvm, irq); +} + /** * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs * @vcpu: The VCPU requesting a SGI @@ -1051,83 +1092,46 @@ static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) * This will trap in sys_regs.c and call this function. * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the * target processors as well as a bitmask of 16 Aff0 CPUs. - * If the interrupt routing mode bit is not set, we iterate over all VCPUs to - * check for matching ones. If this bit is set, we signal all, but not the - * calling VCPU. + * + * If the interrupt routing mode bit is not set, we iterate over the Aff0 + * bits and signal the VCPUs matching the provided Aff{3,2,1}. + * + * If this bit is set, we signal all, but not the calling VCPU. */ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu *c_vcpu; - u16 target_cpus; + unsigned long target_cpus; u64 mpidr; - int sgi; - int vcpu_id = vcpu->vcpu_id; - bool broadcast; - unsigned long c, flags; - - sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; - broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); - target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; - mpidr = SGI_AFFINITY_LEVEL(reg, 3); - mpidr |= SGI_AFFINITY_LEVEL(reg, 2); - mpidr |= SGI_AFFINITY_LEVEL(reg, 1); - - /* - * We iterate over all VCPUs to find the MPIDRs matching the request. - * If we have handled one CPU, we clear its bit to detect early - * if we are already finished. This avoids iterating through all - * VCPUs when most of the times we just signal a single VCPU. - */ - kvm_for_each_vcpu(c, c_vcpu, kvm) { - struct vgic_irq *irq; - - /* Exit early if we have dealt with all requested CPUs */ - if (!broadcast && target_cpus == 0) - break; - - /* Don't signal the calling VCPU */ - if (broadcast && c == vcpu_id) - continue; + u32 sgi, aff0; + unsigned long c; - if (!broadcast) { - int level0; + sgi = FIELD_GET(ICC_SGI1R_SGI_ID_MASK, reg); - level0 = match_mpidr(mpidr, target_cpus, c_vcpu); - if (level0 == -1) + /* Broadcast */ + if (unlikely(reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT))) { + kvm_for_each_vcpu(c, c_vcpu, kvm) { + /* Don't signal the calling VCPU */ + if (c_vcpu == vcpu) continue; - /* remove this matching VCPU from the mask */ - target_cpus &= ~BIT(level0); + vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); } - irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); + return; + } - /* - * An access targeting Group0 SGIs can only generate - * those, while an access targeting Group1 SGIs can - * generate interrupts of either group. - */ - if (!irq->group || allow_group1) { - if (!irq->hw) { - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { - /* HW SGI? Ask the GIC to inject it */ - int err; - err = irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - true); - WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } - } else { - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } + /* We iterate over affinities to find the corresponding vcpus */ + mpidr = SGI_AFFINITY_LEVEL(reg, 3); + mpidr |= SGI_AFFINITY_LEVEL(reg, 2); + mpidr |= SGI_AFFINITY_LEVEL(reg, 1); + target_cpus = FIELD_GET(ICC_SGI1R_TARGET_LIST_MASK, reg); - vgic_put_irq(vcpu->kvm, irq); + for_each_set_bit(aff0, &target_cpus, hweight_long(ICC_SGI1R_TARGET_LIST_MASK)) { + c_vcpu = kvm_mpidr_to_vcpu(kvm, mpidr | aff0); + if (c_vcpu) + vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); } } diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index b32d434c1d4a..a573b1f0c6cb 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -50,7 +50,7 @@ unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->group) value |= BIT(i); @@ -74,7 +74,7 @@ void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, unsigned long flags; for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->group = !!(val & BIT(i)); @@ -102,7 +102,7 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->enabled) value |= (1U << i); @@ -122,7 +122,7 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { @@ -171,7 +171,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled) @@ -193,7 +193,7 @@ int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = true; @@ -214,7 +214,7 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = false; @@ -236,7 +236,7 @@ static unsigned long __read_pending(struct kvm_vcpu *vcpu, /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); unsigned long flags; bool val; @@ -301,25 +301,32 @@ static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2); } -void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) +static void __set_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, + unsigned long val, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); - /* GICD_ISPENDR0 SGI bits are WI */ - if (is_vgic_v2_sgi(vcpu, irq)) { + /* GICD_ISPENDR0 SGI bits are WI when written from the guest. */ + if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { vgic_put_irq(vcpu->kvm, irq); continue; } raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* + * GICv2 SGIs are terribly broken. We can't restore + * the source of the interrupt, so just pick the vcpu + * itself as the source... + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source |= BIT(vcpu->vcpu_id); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* HW SGI? Ask the GIC to inject it */ int err; @@ -335,7 +342,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, } irq->pending_latch = true; - if (irq->hw) + if (irq->hw && !is_user) vgic_irq_set_phys_active(irq, true); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); @@ -343,33 +350,18 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, } } +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __set_pending(vcpu, addr, len, val, false); +} + int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - - /* - * GICv2 SGIs are terribly broken. We can't restore - * the source of the interrupt, so just pick the vcpu - * itself as the source... - */ - if (is_vgic_v2_sgi(vcpu, irq)) - irq->source |= BIT(vcpu->vcpu_id); - - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - + __set_pending(vcpu, addr, len, val, true); return 0; } @@ -394,25 +386,33 @@ static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq) vgic_irq_set_phys_active(irq, false); } -void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) +static void __clear_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); - /* GICD_ICPENDR0 SGI bits are WI */ - if (is_vgic_v2_sgi(vcpu, irq)) { + /* GICD_ICPENDR0 SGI bits are WI when written from the guest. */ + if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { vgic_put_irq(vcpu->kvm, irq); continue; } raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* + * More fun with GICv2 SGIs! If we're clearing one of them + * from userspace, which source vcpu to clear? Let's not + * even think of it, and blow the whole set. + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source = 0; + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* HW SGI? Ask the GIC to clear its pending bit */ int err; @@ -427,7 +427,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, continue; } - if (irq->hw) + if (irq->hw && !is_user) vgic_hw_irq_cpending(vcpu, irq); else irq->pending_latch = false; @@ -437,33 +437,18 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, } } +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __clear_pending(vcpu, addr, len, val, false); +} + int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - /* - * More fun with GICv2 SGIs! If we're clearing one of them - * from userspace, which source vcpu to clear? Let's not - * even think of it, and blow the whole set. - */ - if (is_vgic_v2_sgi(vcpu, irq)) - irq->source = 0; - - irq->pending_latch = false; - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - + __clear_pending(vcpu, addr, len, val, true); return 0; } @@ -473,9 +458,10 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, * active state can be overwritten when the VCPU's state is synced coming back * from the guest. * - * For shared interrupts as well as GICv3 private interrupts, we have to - * stop all the VCPUs because interrupts can be migrated while we don't hold - * the IRQ locks and we don't want to be chasing moving targets. + * For shared interrupts as well as GICv3 private interrupts accessed from the + * non-owning CPU, we have to stop all the VCPUs because interrupts can be + * migrated while we don't hold the IRQ locks and we don't want to be chasing + * moving targets. * * For GICv2 private interrupts we don't have to do anything because * userspace accesses to the VGIC state already require all VCPUs to be @@ -484,7 +470,8 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, */ static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) { - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && + vcpu != kvm_get_running_vcpu()) || intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_halt_guest(vcpu->kvm); } @@ -492,7 +479,8 @@ static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) /* See vgic_access_active_prepare */ static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid) { - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && + vcpu != kvm_get_running_vcpu()) || intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_resume_guest(vcpu->kvm); } @@ -506,7 +494,7 @@ static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu, /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* * Even for HW interrupts, don't evaluate the HW state as @@ -527,13 +515,13 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 val; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); val = __vgic_mmio_read_active(vcpu, addr, len); vgic_access_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); return val; } @@ -610,7 +598,7 @@ static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, int i; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, false); vgic_put_irq(vcpu->kvm, irq); } @@ -622,13 +610,13 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_cactive(vcpu, addr, len, val); vgic_access_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); } int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, @@ -647,7 +635,7 @@ static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, int i; for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, true); vgic_put_irq(vcpu->kvm, irq); } @@ -659,13 +647,13 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_sactive(vcpu, addr, len, val); vgic_access_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); } int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, @@ -684,7 +672,7 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, u64 val = 0; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); val |= (u64)irq->priority << (i * 8); @@ -710,7 +698,7 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, unsigned long flags; for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); /* Narrow the priority range to what we actually support */ @@ -731,7 +719,7 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, int i; for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->config == VGIC_CONFIG_EDGE) value |= (2U << (i * 2)); @@ -762,7 +750,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, if (intid + i < VGIC_NR_PRIVATE_IRQS) continue; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + irq = vgic_get_irq(vcpu->kvm, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (test_bit(i * 2 + 1, &val)) @@ -787,7 +775,7 @@ u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) continue; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) val |= (1U << i); @@ -811,7 +799,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) continue; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + irq = vgic_get_vcpu_irq(vcpu, intid + i); /* * Line level is set irrespective of irq type @@ -1093,7 +1081,6 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type type) { struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev; - int ret = 0; unsigned int len; switch (type) { @@ -1104,17 +1091,13 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, len = vgic_v3_init_dist_iodev(io_device); break; default: - BUG_ON(1); + BUG(); } io_device->base_addr = dist_base_address; io_device->iodev_type = IODEV_DIST; io_device->redist_vcpu = NULL; - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, - len, &io_device->dev); - mutex_unlock(&kvm->slots_lock); - - return ret; + return kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, + len, &io_device->dev); } diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h index 5b490a4dfa5e..50dc80220b0f 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.h +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -213,6 +213,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, const u32 val); unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev); unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 645648349c99..585491fbda80 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -9,6 +9,7 @@ #include <kvm/arm_vgic.h> #include <asm/kvm_mmu.h> +#include "vgic-mmio.h" #include "vgic.h" static inline void vgic_v2_write_lr(int lr, u32 val) @@ -26,11 +27,24 @@ void vgic_v2_init_lrs(void) vgic_v2_write_lr(i, 0); } -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - cpuif->vgic_hcr |= GICH_HCR_UIE; + cpuif->vgic_hcr = GICH_HCR_EN; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_UIE; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP0_MASK) ? + GICH_HCR_VGrp0DIE : GICH_HCR_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP1_MASK) ? + GICH_HCR_VGrp1DIE : GICH_HCR_VGrp1EIE; } static bool lr_signals_eoi_mi(u32 lr_val) @@ -39,43 +53,23 @@ static bool lr_signals_eoi_mi(u32 lr_val) !(lr_val & GICH_LR_HW); } -/* - * transfer the content of the LRs back into the corresponding ap_list: - * - active bit is transferred as is - * - pending bit is - * - transferred as is in case of edge sensitive IRQs - * - set to the line-level (resample time) for level sensitive IRQs - */ -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; - int lr; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - - cpuif->vgic_hcr &= ~GICH_HCR_UIE; - - for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) { - u32 val = cpuif->vgic_lr[lr]; - u32 cpuid, intid = val & GICH_LR_VIRTUALID; - struct vgic_irq *irq; - bool deactivated; - - /* Extract the source vCPU id from the LR */ - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - cpuid &= 7; + u32 cpuid, intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; + bool deactivated; - /* Notify fds when the guest EOI'ed a level-triggered SPI */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); + /* Extract the source vCPU id from the LR */ + cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val) & 7; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + /* Notify fds when the guest EOI'ed a level-triggered SPI */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); - raw_spin_lock(&irq->irq_lock); + irq = vgic_get_vcpu_irq(vcpu, intid); + scoped_guard(raw_spinlock, &irq->irq_lock) { /* Always preserve the active bit, note deactivation */ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); irq->active = !!(val & GICH_LR_ACTIVE_BIT); @@ -101,29 +95,139 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); + irq->on_lr = false; } - cpuif->used_lrs = 0; + vgic_put_irq(vcpu->kvm, irq); } +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + /* - * Populates the particular LR with the state of a given IRQ: - * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq - * - for a level sensitive IRQ the pending state value is unchanged; - * it is dictated directly by the input level - * - * If @irq describes an SGI with multiple sources, we choose the - * lowest-numbered source VCPU and clear that bit in the source bitmap. - * - * The irq_lock must be held by the caller. + * transfer the content of the LRs back into the corresponding ap_list: + * - active bit is transferred as is + * - pending bit is + * - transferred as is in case of edge sensitive IRQs + * - set to the line-level (resample time) for level sensitive IRQs */ -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr); + struct vgic_irq *irq; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) + vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]); + + /* See the GICv3 equivalent for the EOIcount handling rationale */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u32 lr; + + if (!eoicount) { + break; + } else { + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + vgic_v2_fold_lr(vcpu, lr); + eoicount--; + } + + cpuif->used_lrs = 0; +} + +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false; + struct vgic_irq *irq; + unsigned long flags; + u64 lr = 0; + u8 cpuid; + + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); + + /* We only deal with DIR when EOIMode==1 */ + if (!(cpuif->vgic_vmcr & GICH_VMCR_EOI_MODE_MASK)) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* See the corresponding v3 code for the rationale */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; + + /* + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. + */ + if (irq->on_lr) { + mmio = true; + goto put; + } + + /* SGI: check that the cpuid matches */ + if (val < VGIC_NR_SGIS && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } + + /* (with a Dalek voice) DEACTIVATE!!!! */ + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + + vgic_v2_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); +} + +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 val = irq->intid; bool allow_pending = true; + WARN_ON(irq->on_lr); + if (irq->active) { val |= GICH_LR_ACTIVE_BIT; if (vgic_irq_is_sgi(irq->intid)) @@ -163,22 +267,52 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (allow_pending && irq_is_pending(irq)) { val |= GICH_LR_PENDING_BIT; - if (irq->config == VGIC_CONFIG_EDGE) - irq->pending_latch = false; - if (vgic_irq_is_sgi(irq->intid)) { u32 src = ffs(irq->source); if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", irq->intid)) - return; + return 0; val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { - irq->pending_latch = true; + if (irq->source & ~BIT(src - 1)) val |= GICH_LR_EOI; - } + } + } + + /* The GICv2 LR only holds five bits of priority. */ + val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + + return val; +} + +/* + * Populates the particular LR with the state of a given IRQ: + * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq + * - for a level sensitive IRQ the pending state value is unchanged; + * it is dictated directly by the input level + * + * If @irq describes an SGI with multiple sources, we choose the + * lowest-numbered source VCPU and clear that bit in the source bitmap. + * + * The irq_lock must be held by the caller. + */ +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 val = vgic_v2_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; + + if (val & GICH_LR_PENDING_BIT) { + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending_latch = false; + + if (vgic_irq_is_sgi(irq->intid)) { + u32 src = ffs(irq->source); + + irq->source &= ~BIT(src - 1); + if (irq->source) + irq->pending_latch = true; } } @@ -194,7 +328,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) /* The GICv2 LR only holds five bits of priority. */ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; + irq->on_lr = true; } void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) @@ -257,7 +391,7 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; } -void vgic_v2_enable(struct kvm_vcpu *vcpu) +void vgic_v2_reset(struct kvm_vcpu *vcpu) { /* * By forcing VMCR to zero, the GIC will restore the binary @@ -265,9 +399,6 @@ void vgic_v2_enable(struct kvm_vcpu *vcpu) * anyway. */ vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; - - /* Get the show on the road... */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; } /* check for overlapping regions and for regions crossing the end of memory */ @@ -289,6 +420,7 @@ static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) int vgic_v2_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned int len; int ret = 0; if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || @@ -312,16 +444,20 @@ int vgic_v2_map_resources(struct kvm *kvm) return ret; } - ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2); - if (ret) { - kvm_err("Unable to register VGIC MMIO regions\n"); + len = vgic_v2_init_cpuif_iodev(&dist->cpuif_iodev); + dist->cpuif_iodev.base_addr = dist->vgic_cpu_base; + dist->cpuif_iodev.iodev_type = IODEV_CPUIF; + dist->cpuif_iodev.redist_vcpu = NULL; + + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist->vgic_cpu_base, + len, &dist->cpuif_iodev.dev); + if (ret) return ret; - } if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, kvm_vgic_global_state.vcpu_base, - KVM_VGIC_V2_CPU_SIZE, true); + KVM_VGIC_V2_CPU_SIZE - SZ_4K, true); if (ret) { kvm_err("Unable to remap VGIC CPU to VCPU\n"); return ret; @@ -391,6 +527,7 @@ int vgic_v2_probe(const struct gic_kvm_info *info) kvm_vgic_global_state.can_emulate_gicv2 = true; kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.gicc_base = info->gicc_base; kvm_vgic_global_state.type = VGIC_V2; kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; @@ -429,16 +566,26 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) void vgic_v2_save_state(struct kvm_vcpu *vcpu) { + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; void __iomem *base = kvm_vgic_global_state.vctrl_base; u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; if (!base) return; - if (used_lrs) { + cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); + + if (used_lrs) save_lrs(vcpu, base); - writel_relaxed(0, base + GICH_HCR); + + if (cpu_if->vgic_hcr & GICH_HCR_LRENPIE) { + u32 val = readl_relaxed(base + GICH_HCR); + + cpu_if->vgic_hcr &= ~GICH_HCR_EOICOUNT; + cpu_if->vgic_hcr |= val & GICH_HCR_EOICOUNT; } + + writel_relaxed(0, base + GICH_HCR); } void vgic_v2_restore_state(struct kvm_vcpu *vcpu) @@ -451,13 +598,10 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu) if (!base) return; - if (used_lrs) { - writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); - for (i = 0; i < used_lrs; i++) { - writel_relaxed(cpu_if->vgic_lr[i], - base + GICH_LR0 + (i * 4)); - } - } + writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); + + for (i = 0; i < used_lrs; i++) + writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4)); } void vgic_v2_load(struct kvm_vcpu *vcpu) @@ -470,17 +614,9 @@ void vgic_v2_load(struct kvm_vcpu *vcpu) kvm_vgic_global_state.vctrl_base + GICH_APR); } -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); -} - void vgic_v2_put(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - vgic_v2_vmcr_sync(vcpu); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c new file mode 100644 index 000000000000..61b44f3f2bf1 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/uaccess.h> + +#include <kvm/arm_vgic.h> + +#include <asm/kvm_arm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> + +#include "vgic.h" + +#define ICH_LRN(n) (ICH_LR0_EL2 + (n)) +#define ICH_AP0RN(n) (ICH_AP0R0_EL2 + (n)) +#define ICH_AP1RN(n) (ICH_AP1R0_EL2 + (n)) + +struct mi_state { + u16 eisr; + u16 elrsr; + bool pend; +}; + +/* + * The shadow registers loaded to the hardware when running a L2 guest + * with the virtual IMO/FMO bits set. + */ +struct shadow_if { + struct vgic_v3_cpu_if cpuif; + unsigned long lr_map; +}; + +static DEFINE_PER_CPU(struct shadow_if, shadow_if); + +static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) +{ + return hweight16(shadow_if->lr_map & (BIT(idx) - 1)); +} + +/* + * Nesting GICv3 support + * + * On a non-nesting VM (only running at EL0/EL1), the host hypervisor + * completely controls the interrupts injected via the list registers. + * Consequently, most of the state that is modified by the guest (by ACK-ing + * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we + * keep a semi-consistent view of the interrupts. + * + * This still applies for a NV guest, but only while "InHost" (either + * running at EL2, or at EL0 with HCR_EL2.{E2H.TGE}=={1,1}. + * + * When running a L2 guest ("not InHost"), things are radically different, + * as the L1 guest is in charge of provisioning the interrupts via its own + * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR + * page. This means that the flow described above does work (there is no + * state to rebuild in the L0 hypervisor), and that most things happed on L2 + * load/put: + * + * - on L2 load: move the in-memory L1 vGIC configuration into a shadow, + * per-CPU data structure that is used to populate the actual LRs. This is + * an extra copy that we could avoid, but life is short. In the process, + * we remap any interrupt that has the HW bit set to the mapped interrupt + * on the host, should the host consider it a HW one. This allows the HW + * deactivation to take its course, such as for the timer. + * + * - on L2 put: perform the inverse transformation, so that the result of L2 + * running becomes visible to L1 in the VNCR-accessible registers. + * + * - there is nothing to do on L2 entry apart from enabling the vgic, as + * everything will have happened on load. However, this is the point where + * we detect that an interrupt targeting L1 and prepare the grand + * switcheroo. + * + * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate + * corresponding the L1 interrupt. The L0 active state will be cleared by + * the HW if the L1 interrupt was itself backed by a HW interrupt. + * + * Maintenance Interrupt (MI) management: + * + * Since the L2 guest runs the vgic in its full glory, MIs get delivered and + * used as a handover point between L2 and L1. + * + * - on delivery of a MI to L0 while L2 is running: make the L1 MI pending, + * and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to + * run and process the MI. + * + * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its + * state must be computed at each entry/exit of the guest, much like we do + * it for the PMU interrupt. + * + * - because most of the ICH_*_EL2 registers live in the VNCR page, the + * quality of emulation is poor: L1 can setup the vgic so that an MI would + * immediately fire, and not observe anything until the next exit. + * Similarly, a pending MI is not immediately disabled by clearing + * ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for + * example. + * + * System register emulation: + * + * We get two classes of registers: + * + * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access + * them, and L0 doesn't see a thing. + * + * - those that always trap (ELRSR, EISR, MISR): these are status registers + * that are built on the fly based on the in-memory state. + * + * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot, + * and a NV L2 would either access the VNCR page provided by L1 (memory + * based registers), or see the access redirected to L1 (registers that + * trap) thanks to NV being set by L1. + */ + +bool vgic_state_is_nested(struct kvm_vcpu *vcpu) +{ + u64 xmo; + + if (is_nested_ctxt(vcpu)) { + xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO); + WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO), + "Separate virtual IRQ/FIQ settings not supported\n"); + + return !!xmo; + } + + return false; +} + +static struct shadow_if *get_shadow_if(void) +{ + return this_cpu_ptr(&shadow_if); +} + +static bool lr_triggers_eoi(u64 lr) +{ + return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI); +} + +static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state) +{ + u16 eisr = 0, elrsr = 0; + bool pend = false; + + for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + + if (lr_triggers_eoi(lr)) + eisr |= BIT(i); + if (!(lr & ICH_LR_STATE)) + elrsr |= BIT(i); + pend |= (lr & ICH_LR_PENDING_BIT); + } + + mi_state->eisr = eisr; + mi_state->elrsr = elrsr; + mi_state->pend = pend; +} + +u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + + vgic_compute_mi_state(vcpu, &mi_state); + return mi_state.eisr; +} + +u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + + vgic_compute_mi_state(vcpu, &mi_state); + return mi_state.elrsr; +} + +u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) +{ + struct mi_state mi_state; + u64 reg = 0, hcr, vmcr; + + hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); + + vgic_compute_mi_state(vcpu, &mi_state); + + if (mi_state.eisr) + reg |= ICH_MISR_EL2_EOI; + + if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) { + int used_lrs = kvm_vgic_global_state.nr_lr; + + used_lrs -= hweight16(mi_state.elrsr); + reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0; + } + + if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr)) + reg |= ICH_MISR_EL2_LRENP; + + if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend) + reg |= ICH_MISR_EL2_NP; + + if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK)) + reg |= ICH_MISR_EL2_VGrp0E; + + if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK)) + reg |= ICH_MISR_EL2_VGrp0D; + + if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK)) + reg |= ICH_MISR_EL2_VGrp1E; + + if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK)) + reg |= ICH_MISR_EL2_VGrp1D; + + return reg; +} + +static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr) +{ + struct vgic_irq *irq; + + if (!(lr & ICH_LR_HW)) + return lr; + + /* We have the HW bit set, check for validity of pINTID */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + /* If there was no real mapping, nuke the HW bit */ + if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI) + lr &= ~ICH_LR_HW; + + /* Translate the virtual mapping to the real one, even if invalid */ + if (irq) { + lr &= ~ICH_LR_PHYS_ID_MASK; + lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + vgic_put_irq(vcpu->kvm, irq); + } + + return lr; +} + +/* + * For LRs which have HW bit set such as timer interrupts, we modify them to + * have the host hardware interrupt number instead of the virtual one programmed + * by the guest hypervisor. + */ +static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, + struct vgic_v3_cpu_if *s_cpu_if) +{ + struct shadow_if *shadow_if; + + shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif); + shadow_if->lr_map = 0; + + for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { + u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + + if (!(lr & ICH_LR_STATE)) + continue; + + lr = translate_lr_pintid(vcpu, lr); + + s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr; + shadow_if->lr_map |= BIT(i); + } + + s_cpu_if->used_lrs = hweight16(shadow_if->lr_map); +} + +void vgic_v3_flush_nested(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2); +} + +void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + int i; + + for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { + u64 val, host_lr, lr; + + host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); + + /* Propagate the new LR state */ + lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + val = lr & ~ICH_LR_STATE; + val |= host_lr & ICH_LR_STATE; + __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); + + /* + * Deactivation of a HW interrupt: the LR must have the HW + * bit set, have been in a non-invalid state before the run, + * and now be in an invalid state. If any of that doesn't + * hold, we're done with this LR. + */ + if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) && + !(host_lr & ICH_LR_STATE))) + continue; + + /* + * If we had a HW lr programmed by the guest hypervisor, we + * need to emulate the HW effect between the guest hypervisor + * and the nested guest. + */ + vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + } + + /* We need these to be synchronised to generate the MI */ + __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2)); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount); + + write_sysreg_s(0, SYS_ICH_HCR_EL2); + isb(); + + vgic_v3_nested_update_mi(vcpu); +} + +static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, + struct vgic_v3_cpu_if *s_cpu_if) +{ + struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3; + int i; + + s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); + s_cpu_if->vgic_sre = host_if->vgic_sre; + + for (i = 0; i < 4; i++) { + s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i)); + s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i)); + } + + vgic_v3_create_shadow_lr(vcpu, s_cpu_if); +} + +void vgic_v3_load_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif; + + BUG_ON(!vgic_state_is_nested(vcpu)); + + vgic_v3_create_shadow_state(vcpu, cpu_if); + + __vgic_v3_restore_vmcr_aprs(cpu_if); + __vgic_v3_activate_traps(cpu_if); + + for (int i = 0; i < cpu_if->used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); + + /* + * Propagate the number of used LRs for the benefit of the HYP + * GICv3 emulation code. Yes, this is a pretty sorry hack. + */ + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs; +} + +void vgic_v3_put_nested(struct kvm_vcpu *vcpu) +{ + struct shadow_if *shadow_if = get_shadow_if(); + struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif; + int i; + + __vgic_v3_save_aprs(s_cpu_if); + + for (i = 0; i < 4; i++) { + __vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]); + __vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]); + } + + for (i = 0; i < s_cpu_if->used_lrs; i++) + __gic_v3_set_lr(0, i); + + __vgic_v3_deactivate_traps(s_cpu_if); + + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; +} + +/* + * If we exit a L2 VM with a pending maintenance interrupt from the GIC, + * then we need to forward this to L1 so that it can re-sync the appropriate + * LRs and sample level triggered interrupts again. + */ +void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu) +{ + bool state = read_sysreg_s(SYS_ICH_MISR_EL2); + + /* This will force a switch back to L1 if the level is high */ + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + vcpu->kvm->arch.vgic.mi_intid, state, vcpu); + + sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0); +} + +void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu) +{ + bool level; + + level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu); + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + vcpu->kvm->arch.vgic.mi_intid, level, vcpu); +} diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 2624963cb95b..1d6dd1b545bd 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -3,13 +3,16 @@ #include <linux/irqchip/arm-gic-v3.h> #include <linux/irq.h> #include <linux/irqdomain.h> +#include <linux/kstrtox.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/string_choices.h> #include <kvm/arm_vgic.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/kvm_asm.h> +#include "vgic-mmio.h" #include "vgic.h" static bool group0_trap; @@ -18,11 +21,48 @@ static bool common_trap; static bool dir_trap; static bool gicv4_enable; -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - cpuif->vgic_hcr |= ICH_HCR_UIE; + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + cpuif->vgic_hcr = ICH_HCR_EL2_En; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; + + if (!als->nr_sgi) + cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG0_MASK) ? + ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ? + ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; + + /* + * Dealing with EOImode=1 is a massive source of headache. Not + * only do we need to track that we have active interrupts + * outside of the LRs and force DIR to be trapped, we also + * need to deal with SPIs that can be deactivated on another + * CPU. + * + * On systems that do not implement TDIR, force the bit in the + * shadow state anyway to avoid IPI-ing on these poor sods. + * + * Note that we set the trap irrespective of EOIMode, as that + * can change behind our back without any warning... + */ + if (!cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) || + irqs_active_outside_lrs(als) || + atomic_read(&vcpu->kvm->arch.vgic.active_spis)) + cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR; } static bool lr_signals_eoi_mi(u64 lr_val) @@ -31,84 +71,238 @@ static bool lr_signals_eoi_mi(u64 lr_val) !(lr_val & ICH_LR_HW); } +static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) +{ + struct vgic_irq *irq; + bool is_v2_sgi = false; + bool deactivated; + u32 intid; + + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + intid = val & ICH_LR_VIRTUAL_ID_MASK; + } else { + intid = val & GICH_LR_VIRTUALID; + is_v2_sgi = vgic_irq_is_sgi(intid); + } + + irq = vgic_get_vcpu_irq(vcpu, intid); + if (!irq) /* An LPI could have been unmapped. */ + return; + + scoped_guard(raw_spinlock, &irq->irq_lock) { + /* Always preserve the active bit for !LPIs, note deactivation */ + if (irq->intid >= VGIC_MIN_LPI) + val &= ~ICH_LR_ACTIVE_BIT; + deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); + irq->active = !!(val & ICH_LR_ACTIVE_BIT); + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & ICH_LR_PENDING_BIT)) + irq->pending_latch = true; + + /* + * Clear soft pending state when level irqs have been acked. + */ + if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) + irq->pending_latch = false; + + if (is_v2_sgi) { + u8 cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val); + + if (irq->active) + irq->active_source = cpuid; + + if (val & ICH_LR_PENDING_BIT) + irq->source |= BIT(cpuid); + } + + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); + + irq->on_lr = false; + } + + /* Notify fds when the guest EOI'ed a level-triggered SPI, and drop the refcount */ + if (deactivated && lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) { + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + atomic_dec_if_positive(&vcpu->kvm->arch.vgic.active_spis); + } + + vgic_put_irq(vcpu->kvm, irq); +} + +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + +static void vgic_v3_deactivate_phys(u32 intid) +{ + if (cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) + gic_insn(intid | FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, 1), CDDI); + else + gic_write_dir(intid); +} + void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - int lr; + u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr); + struct vgic_irq *irq; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - cpuif->vgic_hcr &= ~ICH_HCR_UIE; - - for (lr = 0; lr < cpuif->used_lrs; lr++) { - u64 val = cpuif->vgic_lr[lr]; - u32 intid, cpuid; - struct vgic_irq *irq; - bool is_v2_sgi = false; - bool deactivated; + for (int lr = 0; lr < cpuif->used_lrs; lr++) + vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]); - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + /* + * EOIMode=0: use EOIcount to emulate deactivation. We are + * guaranteed to deactivate in reverse order of the activation, so + * just pick one active interrupt after the other in the ap_list, + * and replay the deactivation as if the CPU was doing it. We also + * rely on priority drop to have taken place, and the list to be + * sorted by priority. + */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u64 lr; - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { - intid = val & ICH_LR_VIRTUAL_ID_MASK; + /* + * I would have loved to write this using a scoped_guard(), + * but using 'continue' here is a total train wreck. + */ + if (!eoicount) { + break; } else { - intid = val & GICH_LR_VIRTUALID; - is_v2_sgi = vgic_irq_is_sgi(intid); + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; } - /* Notify fds when the guest EOI'ed a level-triggered IRQ */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - if (!irq) /* An LPI could have been unmapped. */ - continue; + vgic_v3_fold_lr(vcpu, lr); + eoicount--; + } - raw_spin_lock(&irq->irq_lock); + cpuif->used_lrs = 0; +} - /* Always preserve the active bit, note deactivation */ - deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); - irq->active = !!(val & ICH_LR_ACTIVE_BIT); +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false, is_v2_sgi; + struct vgic_irq *irq; + unsigned long flags; + u64 lr = 0; + u8 cpuid; - if (irq->active && is_v2_sgi) - irq->active_source = cpuid; + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); - /* Edge is the only case where we preserve the pending bit */ - if (irq->config == VGIC_CONFIG_EDGE && - (val & ICH_LR_PENDING_BIT)) { - irq->pending_latch = true; + is_v2_sgi = (model == KVM_DEV_TYPE_ARM_VGIC_V2 && + val < VGIC_NR_SGIS); - if (is_v2_sgi) - irq->source |= (1 << cpuid); - } + /* + * We only deal with DIR when EOIMode==1, and only for SGI, + * PPI or SPI. + */ + if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) || + val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* + * EOIMode=1: we must rely on traps to handle deactivate of + * overflowing interrupts, as there is no ordering guarantee and + * EOIcount isn't being incremented. Priority drop will have taken + * place, as ICV_EOIxR_EL1 only affects the APRs and not the LRs. + * + * Three possibities: + * + * - The irq is not queued on any CPU, and there is nothing to + * do, + * + * - Or the irq is in an LR, meaning that its state is not + * directly observable. Treat it bluntly by making it as if + * this was a write to GICD_ICACTIVER, which will force an + * exit on all vcpus. If it hurts, don't do that. + * + * - Or the irq is active, but not in an LR, and we can + * directly deactivate it by building a pseudo-LR, fold it, + * and queue a request to prune the resulting ap_list, + * + * Special care must be taken to match the source CPUID when + * deactivating a GICv2 SGI. + */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; /* - * Clear soft pending state when level irqs have been acked. + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. */ - if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) - irq->pending_latch = false; + if (irq->on_lr) { + mmio = true; + goto put; + } - /* Handle resampling for mapped interrupts if required */ - vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); + /* GICv2 SGI: check that the cpuid matches */ + if (is_v2_sgi && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); + /* (with a Dalek voice) DEACTIVATE!!!! */ + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; } - cpuif->used_lrs = 0; + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + + vgic_v3_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); } /* Requires the irq to be locked already */ -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 model = vcpu->kvm->arch.vgic.vgic_model; u64 val = irq->intid; bool allow_pending = true, is_v2_sgi; + WARN_ON(irq->on_lr); + is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2); @@ -148,6 +342,35 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (allow_pending && irq_is_pending(irq)) { val |= ICH_LR_PENDING_BIT; + if (is_v2_sgi) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return 0; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + if (irq->source & ~BIT(src - 1)) + val |= ICH_LR_EOI; + } + } + + if (irq->group) + val |= ICH_LR_GROUP; + + val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; + + return val; +} + +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = vgic_v3_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + + if (val & ICH_LR_PENDING_BIT) { if (irq->config == VGIC_CONFIG_EDGE) irq->pending_latch = false; @@ -155,16 +378,9 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) model == KVM_DEV_TYPE_ARM_VGIC_V2) { u32 src = ffs(irq->source); - if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", - irq->intid)) - return; - - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { + irq->source &= ~BIT(src - 1); + if (irq->source) irq->pending_latch = true; - val |= ICH_LR_EOI; - } } } @@ -177,12 +393,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) irq->line_level = false; - if (irq->group) - val |= ICH_LR_GROUP; - - val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + irq->on_lr = true; } void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) @@ -256,7 +467,7 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) -void vgic_v3_enable(struct kvm_vcpu *vcpu) +void vgic_v3_reset(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; @@ -282,23 +493,24 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_sre = 0; } - vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_ID_BITS_MASK) >> - ICH_VTR_ID_BITS_SHIFT; - vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_PRI_BITS_MASK) >> - ICH_VTR_PRI_BITS_SHIFT) + 1; + vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits, + kvm_vgic_global_state.ich_vtr_el2); + vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, + kvm_vgic_global_state.ich_vtr_el2) + 1; +} - /* Get the show on the road... */ - vgic_v3->vgic_hcr = ICH_HCR_EN; - if (group0_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL0; - if (group1_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TALL1; - if (common_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TC; - if (dir_trap) - vgic_v3->vgic_hcr |= ICH_HCR_TDIR; +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (!vgic_is_v3(vcpu->kvm)) + return; + + /* Hide GICv3 sysreg if necessary */ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 || + !irqchip_in_kernel(vcpu->kvm)) + vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | + ICH_HCR_EL2_TC); } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) @@ -339,7 +551,7 @@ retry: if (status) { /* clear consumed data */ val &= ~(1 << bit_nr); - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; } @@ -369,7 +581,7 @@ static void map_all_vpes(struct kvm *kvm) dist->its_vm.vpes[i]->irq)); } -/** +/* * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu lock must be held */ @@ -379,6 +591,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) struct vgic_irq *irq; gpa_t last_ptr = ~(gpa_t)0; bool vlpi_avail = false; + unsigned long index; int ret = 0; u8 val; @@ -395,7 +608,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) vlpi_avail = true; } - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + xa_for_each(&dist->lpi_xa, index, irq) { int byte_offset, bit_nr; struct kvm_vcpu *vcpu; gpa_t pendbase, ptr; @@ -434,7 +647,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) else val &= ~(1 << bit_nr); - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; } @@ -538,7 +751,6 @@ int vgic_v3_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int ret = 0; unsigned long c; kvm_for_each_vcpu(c, vcpu, kvm) { @@ -568,12 +780,6 @@ int vgic_v3_map_resources(struct kvm *kvm) return -EBUSY; } - ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3); - if (ret) { - kvm_err("Unable to register VGICv3 dist MMIO regions\n"); - return ret; - } - if (kvm_vgic_global_state.has_gicv4_1) vgic_v4_configure_vsgis(kvm); @@ -581,28 +787,29 @@ int vgic_v3_map_resources(struct kvm *kvm) } DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); +DEFINE_STATIC_KEY_FALSE(vgic_v3_has_v2_compat); static int __init early_group0_trap_cfg(char *buf) { - return strtobool(buf, &group0_trap); + return kstrtobool(buf, &group0_trap); } early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); static int __init early_group1_trap_cfg(char *buf) { - return strtobool(buf, &group1_trap); + return kstrtobool(buf, &group1_trap); } early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); static int __init early_common_trap_cfg(char *buf) { - return strtobool(buf, &common_trap); + return kstrtobool(buf, &common_trap); } early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); static int __init early_gicv4_enable(char *buf) { - return strtobool(buf, &gicv4_enable); + return kstrtobool(buf, &gicv4_enable); } early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); @@ -615,13 +822,62 @@ static const struct midr_range broken_seis[] = { MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), {}, }; static bool vgic_v3_broken_seis(void) { - return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) && - is_midr_in_range_list(read_cpuid_id(), broken_seis)); + return (is_kernel_in_hyp_mode() && + is_midr_in_range_list(broken_seis) && + (read_sysreg_s(SYS_ICH_VTR_EL2) & ICH_VTR_EL2_SEIS)); +} + +void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, + int nr_inst) +{ + u32 insn, oinsn, rd; + u64 hcr = 0; + + if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_30115)) { + group0_trap = true; + group1_trap = true; + } + + if (vgic_v3_broken_seis()) { + /* We know that these machines have ICH_HCR_EL2.TDIR */ + group0_trap = true; + group1_trap = true; + dir_trap = true; + } + + if (!cpus_have_cap(ARM64_HAS_ICH_HCR_EL2_TDIR)) + common_trap = true; + + if (group0_trap) + hcr |= ICH_HCR_EL2_TALL0; + if (group1_trap) + hcr |= ICH_HCR_EL2_TALL1; + if (common_trap) + hcr |= ICH_HCR_EL2_TC; + if (dir_trap) + hcr |= ICH_HCR_EL2_TDIR; + + /* Compute target register */ + oinsn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn); + + /* movz rd, #(val & 0xffff) */ + insn = aarch64_insn_gen_movewide(rd, + (u16)hcr, + 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr = cpu_to_le32(insn); } /** @@ -635,6 +891,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info) { u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); bool has_v2; + u64 traps; int ret; has_v2 = ich_vtr_el2 >> 63; @@ -652,9 +909,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (info->has_v4) { kvm_vgic_global_state.has_gicv4 = gicv4_enable; kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; - kvm_info("GICv4%s support %sabled\n", + kvm_info("GICv4%s support %s\n", kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", - gicv4_enable ? "en" : "dis"); + str_enabled_disabled(gicv4_enable)); } kvm_vgic_global_state.vcpu_base = 0; @@ -686,29 +943,25 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (kvm_vgic_global_state.vcpu_base == 0) kvm_info("disabling GICv2 emulation\n"); - if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) { - group0_trap = true; - group1_trap = true; - } + /* + * Flip the static branch if the HW supports v2, even if we're + * not using it (such as in protected mode). + */ + if (has_v2) + static_branch_enable(&vgic_v3_has_v2_compat); if (vgic_v3_broken_seis()) { kvm_info("GICv3 with broken locally generated SEI\n"); - - kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK; - group0_trap = true; - group1_trap = true; - if (ich_vtr_el2 & ICH_VTR_TDS_MASK) - dir_trap = true; - else - common_trap = true; + kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; } - if (group0_trap || group1_trap || common_trap | dir_trap) { + traps = vgic_ich_hcr_trap_bits(); + if (traps) { kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", - group0_trap ? "G0" : "", - group1_trap ? "G1" : "", - common_trap ? "C" : "", - dir_trap ? "D" : ""); + (traps & ICH_HCR_EL2_TALL0) ? "G0" : "", + (traps & ICH_HCR_EL2_TALL1) ? "G1" : "", + (traps & ICH_HCR_EL2_TC) ? "C" : "", + (traps & ICH_HCR_EL2_TDIR) ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } @@ -723,15 +976,14 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - /* - * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen - * is dependent on ICC_SRE_EL1.SRE, and we have to perform the - * VMCR_EL2 save/restore in the world switch. - */ - if (likely(cpu_if->vgic_sre)) - kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); + /* If the vgic is nested, perform the full state loading */ + if (vgic_state_is_nested(vcpu)) { + vgic_v3_load_nested(vcpu); + return; + } - kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -739,23 +991,18 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) WARN_ON(vgic_v4_load(vcpu)); } -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); -} - void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - WARN_ON(vgic_v4_put(vcpu, false)); - - vgic_v3_vmcr_sync(vcpu); + if (vgic_state_is_nested(vcpu)) { + vgic_v3_put_nested(vcpu); + return; + } - kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); + WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index a413718be92b..09c3e9eb23f8 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -123,7 +123,7 @@ static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu) * IRQ. The SGI code will do its magic. */ for (i = 0; i < VGIC_NR_SGIS; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); struct irq_desc *desc; unsigned long flags; int ret; @@ -160,9 +160,10 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) int i; for (i = 0; i < VGIC_NR_SGIS; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); struct irq_desc *desc; unsigned long flags; + bool pending; int ret; raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -173,9 +174,11 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) irq->hw = false; ret = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, - &irq->pending_latch); + &pending); WARN_ON(ret); + irq->pending_latch = pending; + desc = irq_to_desc(irq->host_irq); irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); unlock: @@ -184,13 +187,14 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) } } -/* Must be called with the kvm lock held */ void vgic_v4_configure_vsgis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; unsigned long i; + lockdep_assert_held(&kvm->arch.config_lock); + kvm_arm_halt_guest(kvm); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -232,9 +236,8 @@ int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq) * @kvm: Pointer to the VM being initialized * * We may be called each time a vITS is created, or when the - * vgic is initialized. This relies on kvm->lock to be - * held. In both cases, the number of vcpus should now be - * fixed. + * vgic is initialized. In both cases, the number of vcpus + * should now be fixed. */ int vgic_v4_init(struct kvm *kvm) { @@ -243,6 +246,8 @@ int vgic_v4_init(struct kvm *kvm) int nr_vcpus, ret; unsigned long i; + lockdep_assert_held(&kvm->arch.config_lock); + if (!kvm_vgic_global_state.has_gicv4) return 0; /* Nothing to see here... move along. */ @@ -309,14 +314,14 @@ int vgic_v4_init(struct kvm *kvm) /** * vgic_v4_teardown - Free the GICv4 data structures * @kvm: Pointer to the VM being destroyed - * - * Relies on kvm->lock to be held. */ void vgic_v4_teardown(struct kvm *kvm) { struct its_vm *its_vm = &kvm->arch.vgic.its_vm; int i; + lockdep_assert_held(&kvm->arch.config_lock); + if (!its_vm->vpes) return; @@ -334,14 +339,30 @@ void vgic_v4_teardown(struct kvm *kvm) its_vm->vpes = NULL; } -int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db) +static inline bool vgic_v4_want_doorbell(struct kvm_vcpu *vcpu) +{ + if (vcpu_get_flag(vcpu, IN_WFI)) + return true; + + if (likely(!vcpu_has_nv(vcpu))) + return false; + + /* + * GICv4 hardware is only ever used for the L1. Mark the vPE (i.e. the + * L1 context) nonresident and request a doorbell to kick us out of the + * L2 when an IRQ becomes pending. + */ + return vcpu_get_flag(vcpu, IN_NESTED_ERET); +} + +int vgic_v4_put(struct kvm_vcpu *vcpu) { struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident) + if (!vgic_supports_direct_irqs(vcpu->kvm) || !vpe->resident) return 0; - return its_make_vpe_non_resident(vpe, need_db); + return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu)); } int vgic_v4_load(struct kvm_vcpu *vcpu) @@ -349,7 +370,10 @@ int vgic_v4_load(struct kvm_vcpu *vcpu) struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; int err; - if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident) + if (!vgic_supports_direct_irqs(vcpu->kvm) || vpe->resident) + return 0; + + if (vcpu_get_flag(vcpu, IN_WFI)) return 0; /* @@ -410,7 +434,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, struct vgic_irq *irq; struct its_vlpi_map map; unsigned long flags; - int ret; + int ret = 0; if (!vgic_supports_direct_msis(kvm)) return 0; @@ -423,13 +447,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, if (IS_ERR(its)) return 0; - mutex_lock(&its->its_lock); + guard(mutex)(&its->its_lock); - /* Perform the actual DevID/EventID -> LPI translation. */ - ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, - irq_entry->msi.data, &irq); - if (ret) - goto out; + /* + * Perform the actual DevID/EventID -> LPI translation. + * + * Silently exit if translation fails as the guest (or userspace!) has + * managed to do something stupid. Emulated LPI injection will still + * work if the guest figures itself out at a later time. + */ + if (vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq)) + return 0; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* Silently exit if the vLPI is already mapped */ + if (irq->hw) + goto out_unlock_irq; /* * Emit the mapping request. If it fails, the ITS probably @@ -449,68 +484,72 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, ret = its_map_vlpi(virq, &map); if (ret) - goto out; + goto out_unlock_irq; irq->hw = true; irq->host_irq = virq; atomic_inc(&map.vpe->vlpi_count); /* Transfer pending state */ - raw_spin_lock_irqsave(&irq->irq_lock, flags); - if (irq->pending_latch) { - ret = irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - irq->pending_latch); - WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq); + if (!irq->pending_latch) + goto out_unlock_irq; - /* - * Clear pending_latch and communicate this state - * change via vgic_queue_irq_unlock. - */ - irq->pending_latch = false; - vgic_queue_irq_unlock(kvm, irq, flags); - } else { - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } + ret = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, + irq->pending_latch); + WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq); -out: - mutex_unlock(&its->its_lock); + /* + * Clear pending_latch and communicate this state + * change via vgic_queue_irq_unlock. + */ + irq->pending_latch = false; + vgic_queue_irq_unlock(kvm, irq, flags); + return ret; + +out_unlock_irq: + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); return ret; } -int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq, - struct kvm_kernel_irq_routing_entry *irq_entry) +static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq) { - struct vgic_its *its; struct vgic_irq *irq; - int ret; + unsigned long idx; - if (!vgic_supports_direct_msis(kvm)) - return 0; + guard(rcu)(); + xa_for_each(&kvm->arch.vgic.lpi_xa, idx, irq) { + if (!irq->hw || irq->host_irq != host_irq) + continue; - /* - * Get the ITS, and escape early on error (not a valid - * doorbell for any of our vITSs). - */ - its = vgic_get_its(kvm, irq_entry); - if (IS_ERR(its)) - return 0; + if (!vgic_try_get_irq_ref(irq)) + return NULL; - mutex_lock(&its->its_lock); + return irq; + } - ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, - irq_entry->msi.data, &irq); - if (ret) - goto out; + return NULL; +} - WARN_ON(!(irq->hw && irq->host_irq == virq)); +void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) +{ + struct vgic_irq *irq; + unsigned long flags; + + if (!vgic_supports_direct_msis(kvm)) + return; + + irq = __vgic_host_irq_get_vlpi(kvm, host_irq); + if (!irq) + return; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + WARN_ON(irq->hw && irq->host_irq != host_irq); if (irq->hw) { atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); irq->hw = false; - ret = its_unmap_vlpi(virq); + its_unmap_vlpi(host_irq); } -out: - mutex_unlock(&its->its_lock); - return ret; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(kvm, irq); } diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c new file mode 100644 index 000000000000..2d3811f4e117 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <kvm/arm_vgic.h> +#include <linux/irqchip/arm-vgic-info.h> + +#include "vgic.h" + +/* + * Probe for a vGICv5 compatible interrupt controller, returning 0 on success. + * Currently only supports GICv3-based VMs on a GICv5 host, and hence only + * registers a VGIC_V3 device. + */ +int vgic_v5_probe(const struct gic_kvm_info *info) +{ + u64 ich_vtr_el2; + int ret; + + if (!cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) + return -ENODEV; + + kvm_vgic_global_state.type = VGIC_V5; + kvm_vgic_global_state.has_gcie_v3_compat = true; + + /* We only support v3 compat mode - use vGICv3 limits */ + kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; + + kvm_vgic_global_state.vcpu_base = 0; + kvm_vgic_global_state.vctrl_base = NULL; + kvm_vgic_global_state.can_emulate_gicv2 = false; + kvm_vgic_global_state.has_gicv4 = false; + kvm_vgic_global_state.has_gicv4_1 = false; + + ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); + kvm_vgic_global_state.ich_vtr_el2 = (u32)ich_vtr_el2; + + /* + * The ListRegs field is 5 bits, but there is an architectural + * maximum of 16 list registers. Just ignore bit 4... + */ + kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; + + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); + if (ret) { + kvm_err("Cannot register GICv3-legacy KVM device.\n"); + return ret; + } + + static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); + kvm_info("GCIE legacy system register CPU interface\n"); + + return 0; +} diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index d97e6080b421..430aa98888fd 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -24,16 +24,23 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { /* * Locking order is always: * kvm->lock (mutex) - * its->cmd_lock (mutex) - * its->its_lock (mutex) - * vgic_cpu->ap_list_lock must be taken with IRQs disabled - * kvm->lpi_list_lock must be taken with IRQs disabled - * vgic_irq->irq_lock must be taken with IRQs disabled + * vcpu->mutex (mutex) + * kvm->arch.config_lock (mutex) + * its->cmd_lock (mutex) + * its->its_lock (mutex) + * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_cpu->ap_list_lock must be taken with IRQs disabled + * vgic_irq->irq_lock must be taken with IRQs disabled * * As the ap_list_lock might be taken from the timer interrupt handler, * we have to disable IRQs before taking this lock and everything lower * than it. * + * The config_lock has additional ordering requirements: + * kvm->slots_lock + * kvm->srcu + * kvm->arch.config_lock + * * If you need to take multiple locks, always take the upper lock first, * then the lower ones, e.g. first take the its_lock, then the irq_lock. * If you are already holding a lock and need to take a higher one, you @@ -52,32 +59,22 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { */ /* - * Iterate over the VM's list of mapped LPIs to find the one with a - * matching interrupt ID and return a reference to the IRQ structure. + * Index the VM's xarray of mapped LPIs and return a reference to the IRQ + * structure. The caller is expected to call vgic_put_irq() later once it's + * finished with the IRQ. */ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq = NULL; - unsigned long flags; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + rcu_read_lock(); - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - if (irq->intid != intid) - continue; + irq = xa_load(&dist->lpi_xa, intid); + if (!vgic_try_get_irq_ref(irq)) + irq = NULL; - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() later once it's finished with the IRQ. - */ - vgic_get_irq_kref(irq); - goto out_unlock; - } - irq = NULL; - -out_unlock: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + rcu_read_unlock(); return irq; } @@ -87,17 +84,11 @@ out_unlock: * struct vgic_irq. It also increases the refcount, so any caller is expected * to call vgic_put_irq() once it's finished with this IRQ. */ -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid) +struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid) { - /* SGIs and PPIs */ - if (intid <= VGIC_MAX_PRIVATE) { - intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); - return &vcpu->arch.vgic_cpu.private_irqs[intid]; - } - /* SPIs */ - if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { + if (intid >= VGIC_NR_PRIVATE_IRQS && + intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; } @@ -109,29 +100,42 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, return NULL; } -/* - * We can't do anything in here, because we lack the kvm pointer to - * lock and remove the item from the lpi_list. So we keep this function - * empty and use the return value of kref_put() to trigger the freeing. - */ -static void vgic_irq_release(struct kref *ref) +struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid) { + if (WARN_ON(!vcpu)) + return NULL; + + /* SGIs and PPIs */ + if (intid < VGIC_NR_PRIVATE_IRQS) { + intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS); + return &vcpu->arch.vgic_cpu.private_irqs[intid]; + } + + return vgic_get_irq(vcpu->kvm, intid); } -/* - * Drop the refcount on the LPI. Must be called with lpi_list_lock held. - */ -void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq) +static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq) { - struct vgic_dist *dist = &kvm->arch.vgic; + lockdep_assert_held(&dist->lpi_xa.xa_lock); + __xa_erase(&dist->lpi_xa, irq->intid); + kfree_rcu(irq, rcu); +} - if (!kref_put(&irq->refcount, vgic_irq_release)) - return; +static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return false; - list_del(&irq->lpi_list); - dist->lpi_list_count--; + return refcount_dec_and_test(&irq->refcount); +} - kfree(irq); +static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq) +{ + if (!__vgic_put_irq(kvm, irq)) + return false; + + irq->pending_release = true; + return true; } void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) @@ -139,18 +143,44 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) struct vgic_dist *dist = &kvm->arch.vgic; unsigned long flags; - if (irq->intid < VGIC_MIN_LPI) + /* + * Normally the lock is only taken when the refcount drops to 0. + * Acquire/release it early on lockdep kernels to make locking issues + * in rare release paths a bit more obvious. + */ + if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) { + guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock); + } + + if (!__vgic_put_irq(kvm, irq)) return; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - __vgic_put_lpi_locked(kvm, irq); - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + xa_lock_irqsave(&dist->lpi_xa, flags); + vgic_release_lpi_locked(dist, irq); + xa_unlock_irqrestore(&dist->lpi_xa, flags); +} + +static void vgic_release_deleted_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long flags, intid; + struct vgic_irq *irq; + + xa_lock_irqsave(&dist->lpi_xa, flags); + + xa_for_each(&dist->lpi_xa, intid, irq) { + if (irq->pending_release) + vgic_release_lpi_locked(dist, irq); + } + + xa_unlock_irqrestore(&dist->lpi_xa, flags); } void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted = false; unsigned long flags; raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); @@ -161,11 +191,14 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) list_del(&irq->ap_list); irq->vcpu = NULL; raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); + deleted |= vgic_put_irq_norelease(vcpu->kvm, irq); } } raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + + if (deleted) + vgic_release_deleted_lpis(vcpu->kvm); } void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) @@ -201,7 +234,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) } /** - * kvm_vgic_target_oracle - compute the target vcpu for an irq + * vgic_target_oracle - compute the target vcpu for an irq * * @irq: The irq to route. Must be already locked. * @@ -211,7 +244,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) * * Requires the IRQ lock to be held. */ -static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) +struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) { lockdep_assert_held(&irq->irq_lock); @@ -239,17 +272,20 @@ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) return NULL; } +struct vgic_sort_info { + struct kvm_vcpu *vcpu; + struct vgic_vmcr vmcr; +}; + /* * The order of items in the ap_lists defines how we'll pack things in LRs as * well, the first items in the list being the first things populated in the * LRs. * - * A hard rule is that active interrupts can never be pushed out of the LRs - * (and therefore take priority) since we cannot reliably trap on deactivation - * of IRQs and therefore they have to be present in the LRs. - * + * Pending, non-active interrupts must be placed at the head of the list. * Otherwise things should be sorted by the priority field and the GIC * hardware support will take care of preemption of priority groups etc. + * Interrupts that are not deliverable should be at the end of the list. * * Return negative if "a" sorts before "b", 0 to preserve order, and positive * to sort "b" before "a". @@ -259,6 +295,8 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, { struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); + struct vgic_sort_info *info = priv; + struct kvm_vcpu *vcpu = info->vcpu; bool penda, pendb; int ret; @@ -272,21 +310,32 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, raw_spin_lock(&irqa->irq_lock); raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); - if (irqa->active || irqb->active) { - ret = (int)irqb->active - (int)irqa->active; + /* Undeliverable interrupts should be last */ + ret = (int)(vgic_target_oracle(irqb) == vcpu) - (int)(vgic_target_oracle(irqa) == vcpu); + if (ret) goto out; - } - penda = irqa->enabled && irq_is_pending(irqa); - pendb = irqb->enabled && irq_is_pending(irqb); + /* Same thing for interrupts targeting a disabled group */ + ret = (int)(irqb->group ? info->vmcr.grpen1 : info->vmcr.grpen0); + ret -= (int)(irqa->group ? info->vmcr.grpen1 : info->vmcr.grpen0); + if (ret) + goto out; + + penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active; + pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active; + + ret = (int)pendb - (int)penda; + if (ret) + goto out; - if (!penda || !pendb) { - ret = (int)pendb - (int)penda; + /* Both pending and enabled, sort by priority (lower number first) */ + ret = (int)irqa->priority - (int)irqb->priority; + if (ret) goto out; - } - /* Both pending and enabled, sort by priority */ - ret = irqa->priority - irqb->priority; + /* Finally, HW bit active interrupts have priority over non-HW ones */ + ret = (int)irqb->hw - (int)irqa->hw; + out: raw_spin_unlock(&irqb->irq_lock); raw_spin_unlock(&irqa->irq_lock); @@ -297,10 +346,12 @@ out: static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_sort_info info = { .vcpu = vcpu, }; lockdep_assert_held(&vgic_cpu->ap_list_lock); - list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); + vgic_get_vmcr(vcpu, &info.vmcr); + list_sort(&info, &vgic_cpu->ap_list_head, vgic_irq_cmp); } /* @@ -323,6 +374,20 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne return false; } +static bool vgic_model_needs_bcst_kick(struct kvm *kvm) +{ + /* + * A GICv3 (or GICv3-like) system exposing a GICv3 to the guest + * needs a broadcast kick to set TDIR globally. + * + * For systems that do not have TDIR (ARM's own v8.0 CPUs), the + * shadow TDIR bit is always set, and so is the register's TC bit, + * so no need to kick the CPUs. + */ + return (cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) && + kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3); +} + /* * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. * Do the queuing if necessary, taking the right locks in the right order. @@ -332,9 +397,10 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne * with all locks dropped. */ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags) + unsigned long flags) __releases(&irq->irq_lock) { struct kvm_vcpu *vcpu; + bool bcast; lockdep_assert_held(&irq->irq_lock); @@ -402,17 +468,27 @@ retry: /* * Grab a reference to the irq to reflect the fact that it is - * now in the ap_list. + * now in the ap_list. This is safe as the caller must already hold a + * reference on the irq. */ - vgic_get_irq_kref(irq); + vgic_get_irq_ref(irq); list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; + /* A new SPI may result in deactivation trapping on all vcpus */ + bcast = (vgic_model_needs_bcst_kick(vcpu->kvm) && + vgic_valid_spi(vcpu->kvm, irq->intid) && + atomic_fetch_inc(&vcpu->kvm->arch.vgic.active_spis) == 0); + raw_spin_unlock(&irq->irq_lock); raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); + if (!bcast) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } else { + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_IRQ_PENDING); + } return true; } @@ -420,7 +496,7 @@ retry: /** * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs + * @vcpu: The CPU for PPIs or NULL for global interrupts * @intid: The INTID to inject a new state to. * @level: Edge-triggered: true: to trigger the interrupt * false: to ignore the call @@ -434,25 +510,26 @@ retry: * level-sensitive interrupts. You can think of the level parameter as 1 * being HIGH and 0 being LOW and all devices being active-HIGH. */ -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level, void *owner) +int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int intid, bool level, void *owner) { - struct kvm_vcpu *vcpu; struct vgic_irq *irq; unsigned long flags; int ret; - trace_vgic_update_irq_pending(cpuid, intid, level); - ret = vgic_lazy_init(kvm); if (ret) return ret; - vcpu = kvm_get_vcpu(kvm, cpuid); if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS) return -EINVAL; - irq = vgic_get_irq(kvm, vcpu, intid); + trace_vgic_update_irq_pending(vcpu ? vcpu->vcpu_idx : 0, intid, level); + + if (intid < VGIC_NR_PRIVATE_IRQS) + irq = vgic_get_vcpu_irq(vcpu, intid); + else + irq = vgic_get_irq(kvm, intid); if (!irq) return -EINVAL; @@ -514,7 +591,7 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq) int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, u32 vintid, struct irq_ops *ops) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); unsigned long flags; int ret; @@ -539,7 +616,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, */ void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); unsigned long flags; if (!irq->hw) @@ -562,7 +639,7 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) if (!vgic_initialized(vcpu->kvm)) return -EAGAIN; - irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + irq = vgic_get_vcpu_irq(vcpu, vintid); BUG_ON(!irq); raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -573,6 +650,21 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) return 0; } +int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid) +{ + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); + unsigned long flags; + int ret = -1; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw) + ret = irq->hwintid; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + return ret; +} + /** * kvm_vgic_set_owner - Set the owner of an interrupt for a VM * @@ -596,7 +688,7 @@ int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner) if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid)) return -EINVAL; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + irq = vgic_get_vcpu_irq(vcpu, intid); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->owner && irq->owner != owner) ret = -EEXIST; @@ -619,6 +711,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted_lpis = false; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); @@ -646,12 +739,12 @@ retry: /* * This vgic_put_irq call matches the - * vgic_get_irq_kref in vgic_queue_irq_unlock, + * vgic_get_irq_ref in vgic_queue_irq_unlock, * where we added the LPI to the ap_list. As * we remove the irq from the list, we drop * also drop the refcount. */ - vgic_put_irq(vcpu->kvm, irq); + deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq); continue; } @@ -714,6 +807,9 @@ retry: } raw_spin_unlock(&vgic_cpu->ap_list_lock); + + if (unlikely(deleted_lpis)) + vgic_release_deleted_lpis(vcpu->kvm); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) @@ -744,98 +840,148 @@ static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) vgic_v3_clear_lr(vcpu, lr); } -static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_underflow(vcpu); - else - vgic_v3_set_underflow(vcpu); -} - -/* Requires the ap_list_lock to be held. */ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu, - bool *multi_sgi) +static void summarize_ap_list(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; - int count = 0; - - *multi_sgi = false; lockdep_assert_held(&vgic_cpu->ap_list_lock); + *als = (typeof(*als)){}; + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - int w; + guard(raw_spinlock)(&irq->irq_lock); - raw_spin_lock(&irq->irq_lock); - /* GICv2 SGIs can count for more than one... */ - w = vgic_irq_get_lr_count(irq); - raw_spin_unlock(&irq->irq_lock); + if (unlikely(vgic_target_oracle(irq) != vcpu)) + continue; - count += w; - *multi_sgi |= (w > 1); + if (!irq->active) + als->nr_pend++; + else + als->nr_act++; + + if (irq->intid < VGIC_NR_SGIS) + als->nr_sgi++; } - return count; } -/* Requires the VCPU's ap_list_lock to be held. */ +/* + * Dealing with LR overflow is close to black magic -- dress accordingly. + * + * We have to present an almost infinite number of interrupts through a very + * limited number of registers. Therefore crucial decisions must be made to + * ensure we feed the most relevant interrupts into the LRs, and yet have + * some facilities to let the guest interact with those that are not there. + * + * All considerations below are in the context of interrupts targeting a + * single vcpu with non-idle state (either pending, active, or both), + * colloquially called the ap_list: + * + * - Pending interrupts must have priority over active interrupts. This also + * excludes pending+active interrupts. This ensures that a guest can + * perform priority drops on any number of interrupts, and yet be + * presented the next pending one. + * + * - Deactivation of interrupts outside of the LRs must be tracked by using + * either the EOIcount-driven maintenance interrupt, and sometimes by + * trapping the DIR register. + * + * - For EOImode=0, a non-zero EOIcount means walking the ap_list past the + * point that made it into the LRs, and deactivate interrupts that would + * have made it onto the LRs if we had the space. + * + * - The MI-generation bits must be used to try and force an exit when the + * guest has done enough changes to the LRs that we want to reevaluate the + * situation: + * + * - if the total number of pending interrupts exceeds the number of + * LR, NPIE must be set in order to exit once no pending interrupts + * are present in the LRs, allowing us to populate the next batch. + * + * - if there are active interrupts outside of the LRs, then LRENPIE + * must be set so that we exit on deactivation of one of these, and + * work out which one is to be deactivated. Note that this is not + * enough to deal with EOImode=1, see below. + * + * - if the overall number of interrupts exceeds the number of LRs, + * then UIE must be set to allow refilling of the LRs once the + * majority of them has been processed. + * + * - as usual, MI triggers are only an optimisation, since we cannot + * rely on the MI being delivered in timely manner... + * + * - EOImode=1 creates some additional problems: + * + * - deactivation can happen in any order, and we cannot rely on + * EOImode=0's coupling of priority-drop and deactivation which + * imposes strict reverse Ack order. This means that DIR must + * trap if we have active interrupts outside of the LRs. + * + * - deactivation of SPIs can occur on any CPU, while the SPI is only + * present in the ap_list of the CPU that actually ack-ed it. In that + * case, EOIcount doesn't provide enough information, and we must + * resort to trapping DIR even if we don't overflow the LRs. Bonus + * point for not trapping DIR when no SPIs are pending or active in + * the whole VM. + * + * - LPIs do not suffer the same problem as SPIs on deactivation, as we + * have to essentially discard the active state, see below. + * + * - Virtual LPIs have an active state (surprise!), which gets removed on + * priority drop (EOI). However, EOIcount doesn't get bumped when the LPI + * is not present in the LR (surprise again!). Special care must therefore + * be taken to remove the active state from any activated LPI when exiting + * from the guest. This is in a way no different from what happens on the + * physical side. We still rely on the running priority to have been + * removed from the APRs, irrespective of the LPI being present in the LRs + * or not. + * + * - Virtual SGIs directly injected via GICv4.1 must not affect EOIcount, as + * they are not managed in SW and don't have a true active state. So only + * set vSGIEOICount when no SGIs are in the ap_list. + * + * - GICv2 SGIs with multiple sources are injected one source at a time, as + * if they were made pending sequentially. This may mean that we don't + * always present the HPPI if other interrupts with lower priority are + * pending in the LRs. Big deal. + */ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct ap_list_summary als; struct vgic_irq *irq; - int count; - bool multi_sgi; - u8 prio = 0xff; - int i = 0; + int count = 0; lockdep_assert_held(&vgic_cpu->ap_list_lock); - count = compute_ap_list_depth(vcpu, &multi_sgi); - if (count > kvm_vgic_global_state.nr_lr || multi_sgi) - vgic_sort_ap_list(vcpu); + summarize_ap_list(vcpu, &als); - count = 0; + if (irqs_outside_lrs(&als)) + vgic_sort_ap_list(vcpu); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - raw_spin_lock(&irq->irq_lock); - - /* - * If we have multi-SGIs in the pipeline, we need to - * guarantee that they are all seen before any IRQ of - * lower priority. In that case, we need to filter out - * these interrupts by exiting early. This is easy as - * the AP list has been sorted already. - */ - if (multi_sgi && irq->priority > prio) { - _raw_spin_unlock(&irq->irq_lock); - break; + scoped_guard(raw_spinlock, &irq->irq_lock) { + if (likely(vgic_target_oracle(irq) == vcpu)) { + vgic_populate_lr(vcpu, irq, count++); + } } - if (likely(vgic_target_oracle(irq) == vcpu)) { - vgic_populate_lr(vcpu, irq, count++); - - if (irq->source) - prio = irq->priority; - } - - raw_spin_unlock(&irq->irq_lock); - - if (count == kvm_vgic_global_state.nr_lr) { - if (!list_is_last(&irq->ap_list, - &vgic_cpu->ap_list_head)) - vgic_set_underflow(vcpu); + if (count == kvm_vgic_global_state.nr_lr) break; - } } /* Nuke remaining LRs */ - for (i = count ; i < kvm_vgic_global_state.nr_lr; i++) + for (int i = count ; i < kvm_vgic_global_state.nr_lr; i++) vgic_clear_lr(vcpu, i); - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count; - else + vgic_v2_configure_hcr(vcpu, &als); + } else { vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count; + vgic_v3_configure_hcr(vcpu, &als); + } } static inline bool can_access_vgic_from_kernel(void) @@ -859,23 +1005,31 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu) /* Sync back the hardware VGIC state into our emulation after a guest's run. */ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - int used_lrs; - - /* An empty ap_list_head implies used_lrs == 0 */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) + /* If nesting, emulate the HW effect from L0 to L1 */ + if (vgic_state_is_nested(vcpu)) { + vgic_v3_sync_nested(vcpu); return; + } + + if (vcpu_has_nv(vcpu)) + vgic_v3_nested_update_mi(vcpu); if (can_access_vgic_from_kernel()) vgic_save_state(vcpu); - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; - else - used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; + vgic_fold_lr_state(vcpu); + vgic_prune_ap_list(vcpu); +} + +/* Sync interrupts that were deactivated through a DIR trap */ +void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu) +{ + unsigned long flags; - if (used_lrs) - vgic_fold_lr_state(vcpu); + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); vgic_prune_ap_list(vcpu); + local_irq_restore(flags); } static inline void vgic_restore_state(struct kvm_vcpu *vcpu) @@ -890,42 +1044,57 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu) void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) { /* - * If there are no virtual interrupts active or pending for this - * VCPU, then there is no work to do and we can bail out without - * taking any lock. There is a potential race with someone injecting - * interrupts to the VCPU, but it is a benign race as the VCPU will - * either observe the new interrupt before or after doing this check, - * and introducing additional synchronization mechanism doesn't change - * this. + * If in a nested state, we must return early. Two possibilities: * - * Note that we still need to go through the whole thing if anything - * can be directly injected (GICv4). + * - If we have any pending IRQ for the guest and the guest + * expects IRQs to be handled in its virtual EL2 mode (the + * virtual IMO bit is set) and it is not already running in + * virtual EL2 mode, then we have to emulate an IRQ + * exception to virtual EL2. + * + * We do that by placing a request to ourselves which will + * abort the entry procedure and inject the exception at the + * beginning of the run loop. + * + * - Otherwise, do exactly *NOTHING* apart from enabling the virtual + * CPU interface. The guest state is already loaded, and we can + * carry on with running it. + * + * If we have NV, but are not in a nested state, compute the + * maintenance interrupt state, as it may fire. */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) && - !vgic_supports_direct_msis(vcpu->kvm)) + if (vgic_state_is_nested(vcpu)) { + if (kvm_vgic_vcpu_pending_irq(vcpu)) + kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu); + + vgic_v3_flush_nested(vcpu); return; + } + + if (vcpu_has_nv(vcpu)) + vgic_v3_nested_update_mi(vcpu); DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) { - raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); + scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock) vgic_flush_lr_state(vcpu); - raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); - } if (can_access_vgic_from_kernel()) vgic_restore_state(vcpu); - if (vgic_supports_direct_msis(vcpu->kvm)) + if (vgic_supports_direct_irqs(vcpu->kvm)) vgic_v4_commit(vcpu); } void kvm_vgic_load(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_load(vcpu); else vgic_v3_load(vcpu); @@ -933,26 +1102,18 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) void kvm_vgic_put(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_put(vcpu); else vgic_v3_put(vcpu); } -void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu) -{ - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_vmcr_sync(vcpu); - else - vgic_v3_vmcr_sync(vcpu); -} - int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; @@ -1013,7 +1174,7 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) if (!vgic_initialized(vcpu->kvm)) return false; - irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + irq = vgic_get_vcpu_irq(vcpu, vintid); raw_spin_lock_irqsave(&irq->irq_lock, flags); map_is_active = irq->hw && irq->active; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 23e280fa0a16..5f0fc96b4dc2 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -6,6 +6,7 @@ #define __KVM_ARM_VGIC_NEW_H__ #include <linux/irqchip/arm-gic-common.h> +#include <asm/kvm_mmu.h> #define PRODUCT_ID_KVM 0x4b /* ASCII code K */ #define IMPLEMENTER_ARM 0x43b @@ -15,6 +16,7 @@ #define INTERRUPT_ID_BITS_SPIS 10 #define INTERRUPT_ID_BITS_ITS 16 +#define VGIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1) #define VGIC_PRI_BITS 5 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) @@ -62,6 +64,24 @@ KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) +#define KVM_ICC_SRE_EL2 (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE | \ + ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB) +#define KVM_ICH_VTR_EL2_RES0 (ICH_VTR_EL2_DVIM | \ + ICH_VTR_EL2_A3V | \ + ICH_VTR_EL2_IDbits) +#define KVM_ICH_VTR_EL2_RES1 ICH_VTR_EL2_nV4 + +static inline u64 kvm_get_guest_vtr_el2(void) +{ + u64 vtr; + + vtr = kvm_vgic_global_state.ich_vtr_el2; + vtr &= ~KVM_ICH_VTR_EL2_RES0; + vtr |= KVM_ICH_VTR_EL2_RES1; + + return vtr; +} + /* * As per Documentation/virt/kvm/devices/arm-vgic-its.rst, * below macros are defined for ITS table entry encoding. @@ -131,6 +151,35 @@ static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) return vgic_irq_get_lr_count(irq) > 1; } +static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, + const void *data, unsigned long len) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret; + + dist->table_write_in_progress = true; + ret = kvm_write_guest_lock(kvm, gpa, data, len); + dist->table_write_in_progress = false; + + return ret; +} + +void kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst); + +static inline u64 vgic_ich_hcr_trap_bits(void) +{ + u64 hcr; + + /* All the traps are in the bottom 16bits */ + asm volatile(ALTERNATIVE_CB("movz %0, #0\n", + ARM64_ALWAYS_SYSTEM, + kvm_compute_ich_hcr_trap_bits) + : "=r" (hcr)); + + return hcr; +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC @@ -157,6 +206,51 @@ struct vgic_reg_attr { gpa_t addr; }; +struct its_device { + struct list_head dev_list; + + /* the head for the list of ITTEs */ + struct list_head itt_head; + u32 num_eventid_bits; + gpa_t itt_addr; + u32 device_id; +}; + +#define COLLECTION_NOT_MAPPED ((u32)~0) + +struct its_collection { + struct list_head coll_list; + + u32 collection_id; + u32 target_addr; +}; + +#define its_is_collection_mapped(coll) ((coll) && \ + ((coll)->target_addr != COLLECTION_NOT_MAPPED)) + +struct its_ite { + struct list_head ite_list; + + struct vgic_irq *irq; + struct its_collection *collection; + u32 event_id; +}; + +struct ap_list_summary { + unsigned int nr_pend; /* purely pending, not active */ + unsigned int nr_act; /* active, or active+pending */ + unsigned int nr_sgi; /* any SGI */ +}; + +#define irqs_outside_lrs(s) \ + (((s)->nr_pend + (s)->nr_act) > kvm_vgic_global_state.nr_lr) + +#define irqs_pending_outside_lrs(s) \ + ((s)->nr_pend > kvm_vgic_global_state.nr_lr) + +#define irqs_active_outside_lrs(s) \ + ((s)->nr_act && irqs_outside_lrs(s)) + int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr); int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, @@ -164,15 +258,15 @@ int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, const struct vgic_register_region * vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, int len); -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid); -void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq); +struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid); +struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); +struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq); bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags); + unsigned long flags) __releases(&irq->irq_lock); void vgic_kick_vcpus(struct kvm *kvm); void vgic_irq_handle_resampling(struct vgic_irq *irq, bool lr_deactivated, bool lr_pending); @@ -183,9 +277,9 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); -void vgic_v2_set_npie(struct kvm_vcpu *vcpu); +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); @@ -193,7 +287,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v2_enable(struct kvm_vcpu *vcpu); +void vgic_v2_reset(struct kvm_vcpu *vcpu); int vgic_v2_probe(const struct gic_kvm_info *info); int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, @@ -202,38 +296,45 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, void vgic_v2_init_lrs(void); void vgic_v2_load(struct kvm_vcpu *vcpu); void vgic_v2_put(struct kvm_vcpu *vcpu); -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); -static inline void vgic_get_irq_kref(struct vgic_irq *irq) +static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq) { + if (!irq) + return false; + if (irq->intid < VGIC_MIN_LPI) - return; + return true; + + return refcount_inc_not_zero(&irq->refcount); +} - kref_get(&irq->refcount); +static inline void vgic_get_irq_ref(struct vgic_irq *irq) +{ + WARN_ON_ONCE(!vgic_try_get_irq_ref(irq)); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); -void vgic_v3_set_npie(struct kvm_vcpu *vcpu); +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val); +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_enable(struct kvm_vcpu *vcpu); +void vgic_v3_reset(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); int vgic_v3_save_pending_tables(struct kvm *kvm); int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count); int vgic_register_redist_iodev(struct kvm_vcpu *vcpu); +void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu); bool vgic_v3_check_base(struct kvm *kvm); void vgic_v3_load(struct kvm_vcpu *vcpu); void vgic_v3_put(struct kvm_vcpu *vcpu); -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); @@ -248,6 +349,7 @@ int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr, bool is_write); int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); +const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz); int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, u32 intid, u32 *val); int kvm_register_vgic_device(unsigned long type); @@ -259,8 +361,7 @@ int vgic_init(struct kvm *kvm); void vgic_debug_init(struct kvm *kvm); void vgic_debug_destroy(struct kvm *kvm); -bool lock_all_vcpus(struct kvm *kvm); -void unlock_all_vcpus(struct kvm *kvm); +int vgic_v5_probe(const struct gic_kvm_info *info); static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) { @@ -300,7 +401,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index); -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg); +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg); bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); @@ -313,24 +414,57 @@ static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) } bool vgic_lpis_enabled(struct kvm_vcpu *vcpu); -int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr); int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq **irq); struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi); -void vgic_lpi_translation_cache_init(struct kvm *kvm); -void vgic_lpi_translation_cache_destroy(struct kvm *kvm); -void vgic_its_invalidate_cache(struct kvm *kvm); +void vgic_its_invalidate_all_caches(struct kvm *kvm); /* GICv4.1 MMIO interface */ int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq); int vgic_its_invall(struct kvm_vcpu *vcpu); +bool system_supports_direct_sgis(void); bool vgic_supports_direct_msis(struct kvm *kvm); +bool vgic_supports_direct_sgis(struct kvm *kvm); + +static inline bool vgic_supports_direct_irqs(struct kvm *kvm) +{ + return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm); +} + int vgic_v4_init(struct kvm *kvm); void vgic_v4_teardown(struct kvm *kvm); void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu); + +static inline bool kvm_has_gicv3(struct kvm *kvm) +{ + return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); +} + +void vgic_v3_flush_nested(struct kvm_vcpu *vcpu); +void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); +void vgic_v3_load_nested(struct kvm_vcpu *vcpu); +void vgic_v3_put_nested(struct kvm_vcpu *vcpu); +void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu); +void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu); + +static inline bool vgic_is_v3_compat(struct kvm *kvm) +{ + return cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF) && + kvm_vgic_global_state.has_gcie_v3_compat; +} + +static inline bool vgic_is_v3(struct kvm *kvm) +{ + return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm); +} + +int vgic_its_debug_init(struct kvm_device *dev); +void vgic_its_debug_destroy(struct kvm_device *dev); + #endif diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index d78ae63d7c15..7fe8ba1a2851 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -16,7 +16,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> -unsigned int kvm_arm_vmid_bits; +unsigned int __ro_after_init kvm_arm_vmid_bits; static DEFINE_RAW_SPINLOCK(cpu_vmid_lock); static atomic64_t vmid_generation; @@ -47,7 +47,7 @@ static void flush_context(void) int cpu; u64 vmid; - bitmap_clear(vmid_map, 0, NUM_USER_VMIDS); + bitmap_zero(vmid_map, NUM_USER_VMIDS); for_each_possible_cpu(cpu) { vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0); @@ -172,7 +172,7 @@ void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) /* * Initialize the VMID allocator */ -int kvm_arm_vmid_alloc_init(void) +int __init kvm_arm_vmid_alloc_init(void) { kvm_arm_vmid_bits = kvm_get_vmid_bits(); @@ -182,15 +182,14 @@ int kvm_arm_vmid_alloc_init(void) */ WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus()); atomic64_set(&vmid_generation, VMID_FIRST_VERSION); - vmid_map = kcalloc(BITS_TO_LONGS(NUM_USER_VMIDS), - sizeof(*vmid_map), GFP_KERNEL); + vmid_map = bitmap_zalloc(NUM_USER_VMIDS, GFP_KERNEL); if (!vmid_map) return -ENOMEM; return 0; } -void kvm_arm_vmid_alloc_free(void) +void __init kvm_arm_vmid_alloc_free(void) { - kfree(vmid_map); + bitmap_free(vmid_map); } |
